minhnhat136 committed on
Commit
b89fed9
·
verified ·
1 Parent(s): 3cce76d

Upload 13 files

Browse files
README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: deepseek-ai/deepseek-coder-1.3b-instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data/thanhnd17/llms_finetune/models/models--deepseek-ai--deepseek-coder-1.3b-instruct/snapshots/e063262dac8366fc1f28a4da0ff3c50ea66259ca",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "q_proj",
25
+ "k_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "down_proj",
29
+ "up_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": true
34
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26b7ccb0e9c5b5bab07cc8fbd2ea6fd1cd22935245377c12bb64237a4b6f2c15
3
+ size 239906848
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad2aa0d9caacbfeceba37361f403f991eecfa9e8d41b9a68100e6f0ef83ef0ce
3
+ size 480006786
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e8f864f49788ce7b617fae7b4570bd2ed2676424c10afa12f8f2bc0c5d10f2
3
+ size 14768
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:829aa8a9708a99d0c1af61f5e1134ef8cece61aaa32d89d78cf029b2f0a1a6ec
3
+ size 14768
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8697f5db1bede9a2a1bf81e1d26ab6e778c8268c124716d046afe55aa7974b1a
3
+ size 14768
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd97df55cc774293fb6901986fbe287436cd2d60336b68b79ac037f945817d58
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|EOT|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "32000": {
7
+ "content": "õ",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "32001": {
15
+ "content": "÷",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "32002": {
23
+ "content": "Á",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "32003": {
31
+ "content": "ý",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "32004": {
39
+ "content": "À",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "32005": {
47
+ "content": "ÿ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "32006": {
55
+ "content": "ø",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "32007": {
63
+ "content": "ú",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "32008": {
71
+ "content": "þ",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "32009": {
79
+ "content": "ü",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "32010": {
87
+ "content": "ù",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "32011": {
95
+ "content": "ö",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "32012": {
103
+ "content": "û",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "32013": {
111
+ "content": "<|begin▁of▁sentence|>",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "32014": {
119
+ "content": "<|end▁of▁sentence|>",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "32015": {
127
+ "content": "<|fim▁hole|>",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "32016": {
135
+ "content": "<|fim▁begin|>",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "32017": {
143
+ "content": "<|fim▁end|>",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "32018": {
151
+ "content": "<pad>",
152
+ "lstrip": false,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "32019": {
159
+ "content": "<|User|>",
160
+ "lstrip": false,
161
+ "normalized": true,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "32020": {
167
+ "content": "<|Assistant|>",
168
+ "lstrip": false,
169
+ "normalized": true,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "32021": {
175
+ "content": "<|EOT|>",
176
+ "lstrip": false,
177
+ "normalized": true,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ }
182
+ },
183
+ "bos_token": "<|begin▁of▁sentence|>",
184
+ "chat_template": "{{ '<|begin▁of▁sentence|>' }}{% set system_message = 'You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + content + '\n### Response:' }}{% elif message['role'] == 'assistant' %}{{ '\n' + content + '\n<|EOT|>' + '\n' }}{% endif %}{% endfor %}",
185
+ "clean_up_tokenization_spaces": false,
186
+ "eos_token": "<|EOT|>",
187
+ "legacy": true,
188
+ "model_max_length": 16384,
189
+ "pad_token": "<|end▁of▁sentence|>",
190
+ "padding_side": "right",
191
+ "sp_model_kwargs": {},
192
+ "split_special_tokens": false,
193
+ "tokenizer_class": "LlamaTokenizer",
194
+ "unk_token": null,
195
+ "use_default_system_prompt": false
196
+ }
trainer_state.json ADDED
@@ -0,0 +1,3393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 8.061420345489443,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.019193857965451054,
13
+ "grad_norm": 1.0817455053329468,
14
+ "learning_rate": 4.9999436730259053e-05,
15
+ "loss": 0.2614,
16
+ "num_input_tokens_seen": 933376,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.03838771593090211,
21
+ "grad_norm": 0.6683880686759949,
22
+ "learning_rate": 4.999774694641803e-05,
23
+ "loss": 0.1894,
24
+ "num_input_tokens_seen": 1884800,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.05758157389635317,
29
+ "grad_norm": 0.6495689153671265,
30
+ "learning_rate": 4.999493072462126e-05,
31
+ "loss": 0.1836,
32
+ "num_input_tokens_seen": 2847104,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.07677543186180422,
37
+ "grad_norm": 0.5534822940826416,
38
+ "learning_rate": 4.999098819177214e-05,
39
+ "loss": 0.1759,
40
+ "num_input_tokens_seen": 3780480,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.09596928982725528,
45
+ "grad_norm": 0.5017605423927307,
46
+ "learning_rate": 4.9985919525527434e-05,
47
+ "loss": 0.1705,
48
+ "num_input_tokens_seen": 4742272,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.11516314779270634,
53
+ "grad_norm": 0.4681239128112793,
54
+ "learning_rate": 4.9979724954289244e-05,
55
+ "loss": 0.1722,
56
+ "num_input_tokens_seen": 5687424,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.1343570057581574,
61
+ "grad_norm": 0.48403823375701904,
62
+ "learning_rate": 4.9972404757194736e-05,
63
+ "loss": 0.1696,
64
+ "num_input_tokens_seen": 6600832,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.15355086372360843,
69
+ "grad_norm": 0.48744267225265503,
70
+ "learning_rate": 4.9963959264103544e-05,
71
+ "loss": 0.1638,
72
+ "num_input_tokens_seen": 7545344,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.1727447216890595,
77
+ "grad_norm": 0.4389607608318329,
78
+ "learning_rate": 4.995438885558294e-05,
79
+ "loss": 0.1606,
80
+ "num_input_tokens_seen": 8489728,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.19193857965451055,
85
+ "grad_norm": 0.4652620851993561,
86
+ "learning_rate": 4.994369396289063e-05,
87
+ "loss": 0.1606,
88
+ "num_input_tokens_seen": 9426304,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.21113243761996162,
93
+ "grad_norm": 0.48464274406433105,
94
+ "learning_rate": 4.993187506795538e-05,
95
+ "loss": 0.1661,
96
+ "num_input_tokens_seen": 10372608,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 0.23032629558541268,
101
+ "grad_norm": 0.4516374468803406,
102
+ "learning_rate": 4.9918932703355256e-05,
103
+ "loss": 0.1629,
104
+ "num_input_tokens_seen": 11298944,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.2495201535508637,
109
+ "grad_norm": 0.44246649742126465,
110
+ "learning_rate": 4.990486745229364e-05,
111
+ "loss": 0.1573,
112
+ "num_input_tokens_seen": 12230784,
113
+ "step": 65
114
+ },
115
+ {
116
+ "epoch": 0.2687140115163148,
117
+ "grad_norm": 0.4362417459487915,
118
+ "learning_rate": 4.9889679948572974e-05,
119
+ "loss": 0.1548,
120
+ "num_input_tokens_seen": 13177856,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.28790786948176583,
125
+ "grad_norm": 0.46021828055381775,
126
+ "learning_rate": 4.987337087656614e-05,
127
+ "loss": 0.1536,
128
+ "num_input_tokens_seen": 14148864,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 0.30710172744721687,
133
+ "grad_norm": 0.43709027767181396,
134
+ "learning_rate": 4.98559409711857e-05,
135
+ "loss": 0.1615,
136
+ "num_input_tokens_seen": 15136256,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.32629558541266795,
141
+ "grad_norm": 0.4331000745296478,
142
+ "learning_rate": 4.983739101785071e-05,
143
+ "loss": 0.1593,
144
+ "num_input_tokens_seen": 16088064,
145
+ "step": 85
146
+ },
147
+ {
148
+ "epoch": 0.345489443378119,
149
+ "grad_norm": 0.40528881549835205,
150
+ "learning_rate": 4.981772185245135e-05,
151
+ "loss": 0.1509,
152
+ "num_input_tokens_seen": 17043840,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.3646833013435701,
157
+ "grad_norm": 0.42082345485687256,
158
+ "learning_rate": 4.97969343613113e-05,
159
+ "loss": 0.1518,
160
+ "num_input_tokens_seen": 17986816,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 0.3838771593090211,
165
+ "grad_norm": 0.4149463176727295,
166
+ "learning_rate": 4.977502948114772e-05,
167
+ "loss": 0.1558,
168
+ "num_input_tokens_seen": 18904192,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.40307101727447214,
173
+ "grad_norm": 0.41713905334472656,
174
+ "learning_rate": 4.97520081990291e-05,
175
+ "loss": 0.152,
176
+ "num_input_tokens_seen": 19854720,
177
+ "step": 105
178
+ },
179
+ {
180
+ "epoch": 0.42226487523992323,
181
+ "grad_norm": 0.4443305432796478,
182
+ "learning_rate": 4.9727871552330794e-05,
183
+ "loss": 0.15,
184
+ "num_input_tokens_seen": 20807040,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.44145873320537427,
189
+ "grad_norm": 0.4278104603290558,
190
+ "learning_rate": 4.97026206286882e-05,
191
+ "loss": 0.1563,
192
+ "num_input_tokens_seen": 21742080,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 0.46065259117082535,
197
+ "grad_norm": 0.41511863470077515,
198
+ "learning_rate": 4.967625656594782e-05,
199
+ "loss": 0.1545,
200
+ "num_input_tokens_seen": 22686464,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.4798464491362764,
205
+ "grad_norm": 0.42981916666030884,
206
+ "learning_rate": 4.964878055211597e-05,
207
+ "loss": 0.1483,
208
+ "num_input_tokens_seen": 23654272,
209
+ "step": 125
210
+ },
211
+ {
212
+ "epoch": 0.4990403071017274,
213
+ "grad_norm": 0.39455243945121765,
214
+ "learning_rate": 4.962019382530521e-05,
215
+ "loss": 0.15,
216
+ "num_input_tokens_seen": 24606720,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.5182341650671785,
221
+ "grad_norm": 0.40956762433052063,
222
+ "learning_rate": 4.959049767367859e-05,
223
+ "loss": 0.1516,
224
+ "num_input_tokens_seen": 25554944,
225
+ "step": 135
226
+ },
227
+ {
228
+ "epoch": 0.5374280230326296,
229
+ "grad_norm": 0.39926445484161377,
230
+ "learning_rate": 4.955969343539162e-05,
231
+ "loss": 0.1521,
232
+ "num_input_tokens_seen": 26515968,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 0.5566218809980806,
237
+ "grad_norm": 0.4021078050136566,
238
+ "learning_rate": 4.9527782498531915e-05,
239
+ "loss": 0.1535,
240
+ "num_input_tokens_seen": 27450112,
241
+ "step": 145
242
+ },
243
+ {
244
+ "epoch": 0.5758157389635317,
245
+ "grad_norm": 0.40622368454933167,
246
+ "learning_rate": 4.949476630105669e-05,
247
+ "loss": 0.1521,
248
+ "num_input_tokens_seen": 28398592,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.5950095969289827,
253
+ "grad_norm": 0.4052083194255829,
254
+ "learning_rate": 4.946064633072795e-05,
255
+ "loss": 0.1498,
256
+ "num_input_tokens_seen": 29364864,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 0.6142034548944337,
261
+ "grad_norm": 0.39543649554252625,
262
+ "learning_rate": 4.942542412504543e-05,
263
+ "loss": 0.1499,
264
+ "num_input_tokens_seen": 30316928,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 0.6333973128598849,
269
+ "grad_norm": 0.37419360876083374,
270
+ "learning_rate": 4.9389101271177355e-05,
271
+ "loss": 0.1479,
272
+ "num_input_tokens_seen": 31287680,
273
+ "step": 165
274
+ },
275
+ {
276
+ "epoch": 0.6525911708253359,
277
+ "grad_norm": 0.40969106554985046,
278
+ "learning_rate": 4.935167940588887e-05,
279
+ "loss": 0.1506,
280
+ "num_input_tokens_seen": 32245888,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.6717850287907869,
285
+ "grad_norm": 0.39717161655426025,
286
+ "learning_rate": 4.9313160215468334e-05,
287
+ "loss": 0.1451,
288
+ "num_input_tokens_seen": 33194368,
289
+ "step": 175
290
+ },
291
+ {
292
+ "epoch": 0.690978886756238,
293
+ "grad_norm": 0.3813941478729248,
294
+ "learning_rate": 4.92735454356513e-05,
295
+ "loss": 0.1378,
296
+ "num_input_tokens_seen": 34140160,
297
+ "step": 180
298
+ },
299
+ {
300
+ "epoch": 0.710172744721689,
301
+ "grad_norm": 0.3822474777698517,
302
+ "learning_rate": 4.923283685154231e-05,
303
+ "loss": 0.1417,
304
+ "num_input_tokens_seen": 35078528,
305
+ "step": 185
306
+ },
307
+ {
308
+ "epoch": 0.7293666026871402,
309
+ "grad_norm": 0.4150351583957672,
310
+ "learning_rate": 4.9191036297534454e-05,
311
+ "loss": 0.1476,
312
+ "num_input_tokens_seen": 36013696,
313
+ "step": 190
314
+ },
315
+ {
316
+ "epoch": 0.7485604606525912,
317
+ "grad_norm": 0.3787771463394165,
318
+ "learning_rate": 4.914814565722671e-05,
319
+ "loss": 0.1414,
320
+ "num_input_tokens_seen": 36973056,
321
+ "step": 195
322
+ },
323
+ {
324
+ "epoch": 0.7677543186180422,
325
+ "grad_norm": 0.4316699504852295,
326
+ "learning_rate": 4.910416686333906e-05,
327
+ "loss": 0.1447,
328
+ "num_input_tokens_seen": 37904256,
329
+ "step": 200
330
+ },
331
+ {
332
+ "epoch": 0.7869481765834933,
333
+ "grad_norm": 0.38349735736846924,
334
+ "learning_rate": 4.905910189762542e-05,
335
+ "loss": 0.1417,
336
+ "num_input_tokens_seen": 38858240,
337
+ "step": 205
338
+ },
339
+ {
340
+ "epoch": 0.8061420345489443,
341
+ "grad_norm": 0.41351762413978577,
342
+ "learning_rate": 4.901295279078431e-05,
343
+ "loss": 0.1435,
344
+ "num_input_tokens_seen": 39800192,
345
+ "step": 210
346
+ },
347
+ {
348
+ "epoch": 0.8253358925143954,
349
+ "grad_norm": 0.36677101254463196,
350
+ "learning_rate": 4.896572162236737e-05,
351
+ "loss": 0.1454,
352
+ "num_input_tokens_seen": 40736128,
353
+ "step": 215
354
+ },
355
+ {
356
+ "epoch": 0.8445297504798465,
357
+ "grad_norm": 0.4057168662548065,
358
+ "learning_rate": 4.8917410520685635e-05,
359
+ "loss": 0.1414,
360
+ "num_input_tokens_seen": 41699072,
361
+ "step": 220
362
+ },
363
+ {
364
+ "epoch": 0.8637236084452975,
365
+ "grad_norm": 0.3790329396724701,
366
+ "learning_rate": 4.886802166271364e-05,
367
+ "loss": 0.1452,
368
+ "num_input_tokens_seen": 42651648,
369
+ "step": 225
370
+ },
371
+ {
372
+ "epoch": 0.8829174664107485,
373
+ "grad_norm": 0.37877827882766724,
374
+ "learning_rate": 4.881755727399134e-05,
375
+ "loss": 0.1408,
376
+ "num_input_tokens_seen": 43584512,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.9021113243761996,
381
+ "grad_norm": 0.37050819396972656,
382
+ "learning_rate": 4.8766019628523775e-05,
383
+ "loss": 0.1342,
384
+ "num_input_tokens_seen": 44552448,
385
+ "step": 235
386
+ },
387
+ {
388
+ "epoch": 0.9213051823416507,
389
+ "grad_norm": 0.367374449968338,
390
+ "learning_rate": 4.8713411048678635e-05,
391
+ "loss": 0.142,
392
+ "num_input_tokens_seen": 45498368,
393
+ "step": 240
394
+ },
395
+ {
396
+ "epoch": 0.9404990403071017,
397
+ "grad_norm": 0.3566618859767914,
398
+ "learning_rate": 4.8659733905081634e-05,
399
+ "loss": 0.1398,
400
+ "num_input_tokens_seen": 46445952,
401
+ "step": 245
402
+ },
403
+ {
404
+ "epoch": 0.9596928982725528,
405
+ "grad_norm": 0.35429617762565613,
406
+ "learning_rate": 4.8604990616509616e-05,
407
+ "loss": 0.1345,
408
+ "num_input_tokens_seen": 47391872,
409
+ "step": 250
410
+ },
411
+ {
412
+ "epoch": 0.9788867562380038,
413
+ "grad_norm": 0.3834087550640106,
414
+ "learning_rate": 4.8549183649781626e-05,
415
+ "loss": 0.14,
416
+ "num_input_tokens_seen": 48339584,
417
+ "step": 255
418
+ },
419
+ {
420
+ "epoch": 0.9980806142034548,
421
+ "grad_norm": 0.36162251234054565,
422
+ "learning_rate": 4.849231551964771e-05,
423
+ "loss": 0.1376,
424
+ "num_input_tokens_seen": 49294848,
425
+ "step": 260
426
+ },
427
+ {
428
+ "epoch": 1.017274472168906,
429
+ "grad_norm": 0.3323320746421814,
430
+ "learning_rate": 4.8434388788675635e-05,
431
+ "loss": 0.1149,
432
+ "num_input_tokens_seen": 50247296,
433
+ "step": 265
434
+ },
435
+ {
436
+ "epoch": 1.036468330134357,
437
+ "grad_norm": 0.36334532499313354,
438
+ "learning_rate": 4.837540606713538e-05,
439
+ "loss": 0.1119,
440
+ "num_input_tokens_seen": 51193472,
441
+ "step": 270
442
+ },
443
+ {
444
+ "epoch": 1.055662188099808,
445
+ "grad_norm": 0.37446385622024536,
446
+ "learning_rate": 4.8315370012881514e-05,
447
+ "loss": 0.1125,
448
+ "num_input_tokens_seen": 52144000,
449
+ "step": 275
450
+ },
451
+ {
452
+ "epoch": 1.0748560460652592,
453
+ "grad_norm": 0.4256220757961273,
454
+ "learning_rate": 4.8254283331233464e-05,
455
+ "loss": 0.1083,
456
+ "num_input_tokens_seen": 53097216,
457
+ "step": 280
458
+ },
459
+ {
460
+ "epoch": 1.0940499040307101,
461
+ "grad_norm": 0.39395830035209656,
462
+ "learning_rate": 4.819214877485358e-05,
463
+ "loss": 0.1083,
464
+ "num_input_tokens_seen": 54032896,
465
+ "step": 285
466
+ },
467
+ {
468
+ "epoch": 1.1132437619961613,
469
+ "grad_norm": 0.3702699542045593,
470
+ "learning_rate": 4.812896914362309e-05,
471
+ "loss": 0.1092,
472
+ "num_input_tokens_seen": 54988800,
473
+ "step": 290
474
+ },
475
+ {
476
+ "epoch": 1.1324376199616122,
477
+ "grad_norm": 0.389553040266037,
478
+ "learning_rate": 4.806474728451597e-05,
479
+ "loss": 0.1057,
480
+ "num_input_tokens_seen": 55947520,
481
+ "step": 295
482
+ },
483
+ {
484
+ "epoch": 1.1516314779270633,
485
+ "grad_norm": 0.37560170888900757,
486
+ "learning_rate": 4.799948609147061e-05,
487
+ "loss": 0.108,
488
+ "num_input_tokens_seen": 56864384,
489
+ "step": 300
490
+ },
491
+ {
492
+ "epoch": 1.1708253358925145,
493
+ "grad_norm": 0.3832947611808777,
494
+ "learning_rate": 4.793318850525943e-05,
495
+ "loss": 0.1092,
496
+ "num_input_tokens_seen": 57816960,
497
+ "step": 305
498
+ },
499
+ {
500
+ "epoch": 1.1900191938579654,
501
+ "grad_norm": 0.41275399923324585,
502
+ "learning_rate": 4.786585751335637e-05,
503
+ "loss": 0.1076,
504
+ "num_input_tokens_seen": 58773376,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 1.2092130518234165,
509
+ "grad_norm": 0.4026075601577759,
510
+ "learning_rate": 4.7797496149802256e-05,
511
+ "loss": 0.1061,
512
+ "num_input_tokens_seen": 59710592,
513
+ "step": 315
514
+ },
515
+ {
516
+ "epoch": 1.2284069097888675,
517
+ "grad_norm": 0.4040988087654114,
518
+ "learning_rate": 4.77281074950681e-05,
519
+ "loss": 0.1073,
520
+ "num_input_tokens_seen": 60660352,
521
+ "step": 320
522
+ },
523
+ {
524
+ "epoch": 1.2476007677543186,
525
+ "grad_norm": 0.40489524602890015,
526
+ "learning_rate": 4.765769467591625e-05,
527
+ "loss": 0.1085,
528
+ "num_input_tokens_seen": 61600256,
529
+ "step": 325
530
+ },
531
+ {
532
+ "epoch": 1.2667946257197698,
533
+ "grad_norm": 0.40675088763237,
534
+ "learning_rate": 4.758626086525956e-05,
535
+ "loss": 0.1149,
536
+ "num_input_tokens_seen": 62558720,
537
+ "step": 330
538
+ },
539
+ {
540
+ "epoch": 1.2859884836852207,
541
+ "grad_norm": 0.4014970362186432,
542
+ "learning_rate": 4.751380928201834e-05,
543
+ "loss": 0.1101,
544
+ "num_input_tokens_seen": 63505152,
545
+ "step": 335
546
+ },
547
+ {
548
+ "epoch": 1.3051823416506718,
549
+ "grad_norm": 0.3987894654273987,
550
+ "learning_rate": 4.744034319097535e-05,
551
+ "loss": 0.1113,
552
+ "num_input_tokens_seen": 64475392,
553
+ "step": 340
554
+ },
555
+ {
556
+ "epoch": 1.3243761996161227,
557
+ "grad_norm": 0.3988923728466034,
558
+ "learning_rate": 4.7365865902628684e-05,
559
+ "loss": 0.1065,
560
+ "num_input_tokens_seen": 65412352,
561
+ "step": 345
562
+ },
563
+ {
564
+ "epoch": 1.3435700575815739,
565
+ "grad_norm": 0.40686464309692383,
566
+ "learning_rate": 4.7290380773042575e-05,
567
+ "loss": 0.1058,
568
+ "num_input_tokens_seen": 66358272,
569
+ "step": 350
570
+ },
571
+ {
572
+ "epoch": 1.362763915547025,
573
+ "grad_norm": 0.4416196048259735,
574
+ "learning_rate": 4.7213891203696164e-05,
575
+ "loss": 0.1068,
576
+ "num_input_tokens_seen": 67310208,
577
+ "step": 355
578
+ },
579
+ {
580
+ "epoch": 1.381957773512476,
581
+ "grad_norm": 0.38802987337112427,
582
+ "learning_rate": 4.713640064133025e-05,
583
+ "loss": 0.1119,
584
+ "num_input_tokens_seen": 68256256,
585
+ "step": 360
586
+ },
587
+ {
588
+ "epoch": 1.401151631477927,
589
+ "grad_norm": 0.37979230284690857,
590
+ "learning_rate": 4.705791257779195e-05,
591
+ "loss": 0.1087,
592
+ "num_input_tokens_seen": 69200896,
593
+ "step": 365
594
+ },
595
+ {
596
+ "epoch": 1.420345489443378,
597
+ "grad_norm": 0.40544721484184265,
598
+ "learning_rate": 4.697843054987737e-05,
599
+ "loss": 0.1101,
600
+ "num_input_tokens_seen": 70129792,
601
+ "step": 370
602
+ },
603
+ {
604
+ "epoch": 1.4395393474088292,
605
+ "grad_norm": 0.39788997173309326,
606
+ "learning_rate": 4.68979581391722e-05,
607
+ "loss": 0.1117,
608
+ "num_input_tokens_seen": 71074944,
609
+ "step": 375
610
+ },
611
+ {
612
+ "epoch": 1.4587332053742803,
613
+ "grad_norm": 0.40104883909225464,
614
+ "learning_rate": 4.681649897189036e-05,
615
+ "loss": 0.1067,
616
+ "num_input_tokens_seen": 72009088,
617
+ "step": 380
618
+ },
619
+ {
620
+ "epoch": 1.4779270633397312,
621
+ "grad_norm": 0.4051682949066162,
622
+ "learning_rate": 4.673405671871057e-05,
623
+ "loss": 0.1106,
624
+ "num_input_tokens_seen": 72954880,
625
+ "step": 385
626
+ },
627
+ {
628
+ "epoch": 1.4971209213051824,
629
+ "grad_norm": 0.40561723709106445,
630
+ "learning_rate": 4.665063509461097e-05,
631
+ "loss": 0.108,
632
+ "num_input_tokens_seen": 73890176,
633
+ "step": 390
634
+ },
635
+ {
636
+ "epoch": 1.5163147792706333,
637
+ "grad_norm": 0.4338354468345642,
638
+ "learning_rate": 4.656623785870167e-05,
639
+ "loss": 0.1093,
640
+ "num_input_tokens_seen": 74851840,
641
+ "step": 395
642
+ },
643
+ {
644
+ "epoch": 1.5355086372360844,
645
+ "grad_norm": 0.40794938802719116,
646
+ "learning_rate": 4.6480868814055424e-05,
647
+ "loss": 0.1071,
648
+ "num_input_tokens_seen": 75794816,
649
+ "step": 400
650
+ },
651
+ {
652
+ "epoch": 1.5547024952015356,
653
+ "grad_norm": 0.406972199678421,
654
+ "learning_rate": 4.639453180753619e-05,
655
+ "loss": 0.1092,
656
+ "num_input_tokens_seen": 76736512,
657
+ "step": 405
658
+ },
659
+ {
660
+ "epoch": 1.5738963531669867,
661
+ "grad_norm": 0.4144577383995056,
662
+ "learning_rate": 4.630723072962584e-05,
663
+ "loss": 0.1083,
664
+ "num_input_tokens_seen": 77692288,
665
+ "step": 410
666
+ },
667
+ {
668
+ "epoch": 1.5930902111324377,
669
+ "grad_norm": 0.44782555103302,
670
+ "learning_rate": 4.6218969514248814e-05,
671
+ "loss": 0.1068,
672
+ "num_input_tokens_seen": 78654720,
673
+ "step": 415
674
+ },
675
+ {
676
+ "epoch": 1.6122840690978886,
677
+ "grad_norm": 0.45688125491142273,
678
+ "learning_rate": 4.6129752138594874e-05,
679
+ "loss": 0.1083,
680
+ "num_input_tokens_seen": 79607552,
681
+ "step": 420
682
+ },
683
+ {
684
+ "epoch": 1.6314779270633397,
685
+ "grad_norm": 0.43751007318496704,
686
+ "learning_rate": 4.6039582622939854e-05,
687
+ "loss": 0.1087,
688
+ "num_input_tokens_seen": 80547328,
689
+ "step": 425
690
+ },
691
+ {
692
+ "epoch": 1.6506717850287909,
693
+ "grad_norm": 0.42292338609695435,
694
+ "learning_rate": 4.5948465030464536e-05,
695
+ "loss": 0.107,
696
+ "num_input_tokens_seen": 81478400,
697
+ "step": 430
698
+ },
699
+ {
700
+ "epoch": 1.669865642994242,
701
+ "grad_norm": 0.4191446006298065,
702
+ "learning_rate": 4.5856403467071536e-05,
703
+ "loss": 0.1084,
704
+ "num_input_tokens_seen": 82441984,
705
+ "step": 435
706
+ },
707
+ {
708
+ "epoch": 1.689059500959693,
709
+ "grad_norm": 0.397989958524704,
710
+ "learning_rate": 4.5763402081200294e-05,
711
+ "loss": 0.1069,
712
+ "num_input_tokens_seen": 83382912,
713
+ "step": 440
714
+ },
715
+ {
716
+ "epoch": 1.7082533589251438,
717
+ "grad_norm": 0.4078799784183502,
718
+ "learning_rate": 4.566946506364013e-05,
719
+ "loss": 0.1064,
720
+ "num_input_tokens_seen": 84332672,
721
+ "step": 445
722
+ },
723
+ {
724
+ "epoch": 1.727447216890595,
725
+ "grad_norm": 0.41216278076171875,
726
+ "learning_rate": 4.557459664734141e-05,
727
+ "loss": 0.1114,
728
+ "num_input_tokens_seen": 85280128,
729
+ "step": 450
730
+ },
731
+ {
732
+ "epoch": 1.7466410748560461,
733
+ "grad_norm": 0.43625858426094055,
734
+ "learning_rate": 4.54788011072248e-05,
735
+ "loss": 0.1069,
736
+ "num_input_tokens_seen": 86221568,
737
+ "step": 455
738
+ },
739
+ {
740
+ "epoch": 1.7658349328214973,
741
+ "grad_norm": 0.3984062075614929,
742
+ "learning_rate": 4.538208275998861e-05,
743
+ "loss": 0.107,
744
+ "num_input_tokens_seen": 87159936,
745
+ "step": 460
746
+ },
747
+ {
748
+ "epoch": 1.7850287907869482,
749
+ "grad_norm": 0.4033821225166321,
750
+ "learning_rate": 4.528444596391433e-05,
751
+ "loss": 0.1066,
752
+ "num_input_tokens_seen": 88115712,
753
+ "step": 465
754
+ },
755
+ {
756
+ "epoch": 1.8042226487523991,
757
+ "grad_norm": 0.40416958928108215,
758
+ "learning_rate": 4.518589511867017e-05,
759
+ "loss": 0.1054,
760
+ "num_input_tokens_seen": 89069824,
761
+ "step": 470
762
+ },
763
+ {
764
+ "epoch": 1.8234165067178503,
765
+ "grad_norm": 0.4133840501308441,
766
+ "learning_rate": 4.5086434665112864e-05,
767
+ "loss": 0.1055,
768
+ "num_input_tokens_seen": 90018176,
769
+ "step": 475
770
+ },
771
+ {
772
+ "epoch": 1.8426103646833014,
773
+ "grad_norm": 0.44508275389671326,
774
+ "learning_rate": 4.498606908508754e-05,
775
+ "loss": 0.1091,
776
+ "num_input_tokens_seen": 90960128,
777
+ "step": 480
778
+ },
779
+ {
780
+ "epoch": 1.8618042226487526,
781
+ "grad_norm": 0.4138317108154297,
782
+ "learning_rate": 4.4884802901225695e-05,
783
+ "loss": 0.106,
784
+ "num_input_tokens_seen": 91911936,
785
+ "step": 485
786
+ },
787
+ {
788
+ "epoch": 1.8809980806142035,
789
+ "grad_norm": 0.41188734769821167,
790
+ "learning_rate": 4.478264067674155e-05,
791
+ "loss": 0.106,
792
+ "num_input_tokens_seen": 92849280,
793
+ "step": 490
794
+ },
795
+ {
796
+ "epoch": 1.9001919385796544,
797
+ "grad_norm": 0.43868768215179443,
798
+ "learning_rate": 4.4679587015226253e-05,
799
+ "loss": 0.1065,
800
+ "num_input_tokens_seen": 93810560,
801
+ "step": 495
802
+ },
803
+ {
804
+ "epoch": 1.9193857965451055,
805
+ "grad_norm": 0.4238053560256958,
806
+ "learning_rate": 4.457564656044056e-05,
807
+ "loss": 0.1036,
808
+ "num_input_tokens_seen": 94771328,
809
+ "step": 500
810
+ },
811
+ {
812
+ "epoch": 1.9385796545105567,
813
+ "grad_norm": 0.4343940317630768,
814
+ "learning_rate": 4.447082399610549e-05,
815
+ "loss": 0.1039,
816
+ "num_input_tokens_seen": 95733632,
817
+ "step": 505
818
+ },
819
+ {
820
+ "epoch": 1.9577735124760078,
821
+ "grad_norm": 0.4396176040172577,
822
+ "learning_rate": 4.436512404569136e-05,
823
+ "loss": 0.1073,
824
+ "num_input_tokens_seen": 96689280,
825
+ "step": 510
826
+ },
827
+ {
828
+ "epoch": 1.9769673704414588,
829
+ "grad_norm": 0.4687812030315399,
830
+ "learning_rate": 4.4258551472204865e-05,
831
+ "loss": 0.1082,
832
+ "num_input_tokens_seen": 97626112,
833
+ "step": 515
834
+ },
835
+ {
836
+ "epoch": 1.9961612284069097,
837
+ "grad_norm": 0.4092023968696594,
838
+ "learning_rate": 4.415111107797445e-05,
839
+ "loss": 0.1049,
840
+ "num_input_tokens_seen": 98561280,
841
+ "step": 520
842
+ },
843
+ {
844
+ "epoch": 2.015355086372361,
845
+ "grad_norm": 0.3592652976512909,
846
+ "learning_rate": 4.404280770443398e-05,
847
+ "loss": 0.0829,
848
+ "num_input_tokens_seen": 99516160,
849
+ "step": 525
850
+ },
851
+ {
852
+ "epoch": 2.034548944337812,
853
+ "grad_norm": 0.4278601109981537,
854
+ "learning_rate": 4.3933646231904504e-05,
855
+ "loss": 0.0727,
856
+ "num_input_tokens_seen": 100466048,
857
+ "step": 530
858
+ },
859
+ {
860
+ "epoch": 2.053742802303263,
861
+ "grad_norm": 0.48623570799827576,
862
+ "learning_rate": 4.3823631579374354e-05,
863
+ "loss": 0.0706,
864
+ "num_input_tokens_seen": 101422208,
865
+ "step": 535
866
+ },
867
+ {
868
+ "epoch": 2.072936660268714,
869
+ "grad_norm": 0.4464566111564636,
870
+ "learning_rate": 4.371276870427753e-05,
871
+ "loss": 0.0683,
872
+ "num_input_tokens_seen": 102365568,
873
+ "step": 540
874
+ },
875
+ {
876
+ "epoch": 2.092130518234165,
877
+ "grad_norm": 0.48030439019203186,
878
+ "learning_rate": 4.360106260227027e-05,
879
+ "loss": 0.0708,
880
+ "num_input_tokens_seen": 103312128,
881
+ "step": 545
882
+ },
883
+ {
884
+ "epoch": 2.111324376199616,
885
+ "grad_norm": 0.4528995752334595,
886
+ "learning_rate": 4.348851830700593e-05,
887
+ "loss": 0.0693,
888
+ "num_input_tokens_seen": 104258560,
889
+ "step": 550
890
+ },
891
+ {
892
+ "epoch": 2.1305182341650672,
893
+ "grad_norm": 0.48248499631881714,
894
+ "learning_rate": 4.337514088990822e-05,
895
+ "loss": 0.067,
896
+ "num_input_tokens_seen": 105199360,
897
+ "step": 555
898
+ },
899
+ {
900
+ "epoch": 2.1497120921305184,
901
+ "grad_norm": 0.5198423266410828,
902
+ "learning_rate": 4.3260935459942584e-05,
903
+ "loss": 0.0701,
904
+ "num_input_tokens_seen": 106155776,
905
+ "step": 560
906
+ },
907
+ {
908
+ "epoch": 2.168905950095969,
909
+ "grad_norm": 0.47610777616500854,
910
+ "learning_rate": 4.3145907163386064e-05,
911
+ "loss": 0.0673,
912
+ "num_input_tokens_seen": 107121792,
913
+ "step": 565
914
+ },
915
+ {
916
+ "epoch": 2.1880998080614202,
917
+ "grad_norm": 0.4852938950061798,
918
+ "learning_rate": 4.303006118359537e-05,
919
+ "loss": 0.0667,
920
+ "num_input_tokens_seen": 108049280,
921
+ "step": 570
922
+ },
923
+ {
924
+ "epoch": 2.2072936660268714,
925
+ "grad_norm": 0.5004666447639465,
926
+ "learning_rate": 4.2913402740773294e-05,
927
+ "loss": 0.0708,
928
+ "num_input_tokens_seen": 108994816,
929
+ "step": 575
930
+ },
931
+ {
932
+ "epoch": 2.2264875239923225,
933
+ "grad_norm": 0.500302255153656,
934
+ "learning_rate": 4.2795937091733515e-05,
935
+ "loss": 0.073,
936
+ "num_input_tokens_seen": 109929728,
937
+ "step": 580
938
+ },
939
+ {
940
+ "epoch": 2.2456813819577737,
941
+ "grad_norm": 0.45096588134765625,
942
+ "learning_rate": 4.267766952966369e-05,
943
+ "loss": 0.0712,
944
+ "num_input_tokens_seen": 110867328,
945
+ "step": 585
946
+ },
947
+ {
948
+ "epoch": 2.2648752399232244,
949
+ "grad_norm": 0.48498988151550293,
950
+ "learning_rate": 4.255860538388694e-05,
951
+ "loss": 0.0703,
952
+ "num_input_tokens_seen": 111824640,
953
+ "step": 590
954
+ },
955
+ {
956
+ "epoch": 2.2840690978886755,
957
+ "grad_norm": 0.6077526807785034,
958
+ "learning_rate": 4.24387500196217e-05,
959
+ "loss": 0.0719,
960
+ "num_input_tokens_seen": 112779648,
961
+ "step": 595
962
+ },
963
+ {
964
+ "epoch": 2.3032629558541267,
965
+ "grad_norm": 0.5256830453872681,
966
+ "learning_rate": 4.231810883773999e-05,
967
+ "loss": 0.0693,
968
+ "num_input_tokens_seen": 113709312,
969
+ "step": 600
970
+ },
971
+ {
972
+ "epoch": 2.322456813819578,
973
+ "grad_norm": 0.5583354234695435,
974
+ "learning_rate": 4.219668727452397e-05,
975
+ "loss": 0.0754,
976
+ "num_input_tokens_seen": 114654080,
977
+ "step": 605
978
+ },
979
+ {
980
+ "epoch": 2.341650671785029,
981
+ "grad_norm": 0.47424212098121643,
982
+ "learning_rate": 4.207449080142104e-05,
983
+ "loss": 0.0692,
984
+ "num_input_tokens_seen": 115618176,
985
+ "step": 610
986
+ },
987
+ {
988
+ "epoch": 2.36084452975048,
989
+ "grad_norm": 0.4865480959415436,
990
+ "learning_rate": 4.195152492479727e-05,
991
+ "loss": 0.0734,
992
+ "num_input_tokens_seen": 116583168,
993
+ "step": 615
994
+ },
995
+ {
996
+ "epoch": 2.380038387715931,
997
+ "grad_norm": 0.5127935409545898,
998
+ "learning_rate": 4.182779518568926e-05,
999
+ "loss": 0.0756,
1000
+ "num_input_tokens_seen": 117525632,
1001
+ "step": 620
1002
+ },
1003
+ {
1004
+ "epoch": 2.399232245681382,
1005
+ "grad_norm": 0.48488515615463257,
1006
+ "learning_rate": 4.170330715955444e-05,
1007
+ "loss": 0.0721,
1008
+ "num_input_tokens_seen": 118487040,
1009
+ "step": 625
1010
+ },
1011
+ {
1012
+ "epoch": 2.418426103646833,
1013
+ "grad_norm": 0.5220322012901306,
1014
+ "learning_rate": 4.157806645601988e-05,
1015
+ "loss": 0.0708,
1016
+ "num_input_tokens_seen": 119424384,
1017
+ "step": 630
1018
+ },
1019
+ {
1020
+ "epoch": 2.4376199616122842,
1021
+ "grad_norm": 0.4799134433269501,
1022
+ "learning_rate": 4.145207871862947e-05,
1023
+ "loss": 0.071,
1024
+ "num_input_tokens_seen": 120394496,
1025
+ "step": 635
1026
+ },
1027
+ {
1028
+ "epoch": 2.456813819577735,
1029
+ "grad_norm": 0.5201560258865356,
1030
+ "learning_rate": 4.132534962458962e-05,
1031
+ "loss": 0.0733,
1032
+ "num_input_tokens_seen": 121340160,
1033
+ "step": 640
1034
+ },
1035
+ {
1036
+ "epoch": 2.476007677543186,
1037
+ "grad_norm": 0.46850621700286865,
1038
+ "learning_rate": 4.1197884884513474e-05,
1039
+ "loss": 0.0725,
1040
+ "num_input_tokens_seen": 122270720,
1041
+ "step": 645
1042
+ },
1043
+ {
1044
+ "epoch": 2.495201535508637,
1045
+ "grad_norm": 0.5046434998512268,
1046
+ "learning_rate": 4.1069690242163484e-05,
1047
+ "loss": 0.0723,
1048
+ "num_input_tokens_seen": 123226880,
1049
+ "step": 650
1050
+ },
1051
+ {
1052
+ "epoch": 2.5143953934740884,
1053
+ "grad_norm": 0.4388461410999298,
1054
+ "learning_rate": 4.094077147419271e-05,
1055
+ "loss": 0.0702,
1056
+ "num_input_tokens_seen": 124177664,
1057
+ "step": 655
1058
+ },
1059
+ {
1060
+ "epoch": 2.5335892514395395,
1061
+ "grad_norm": 0.5357294678688049,
1062
+ "learning_rate": 4.0811134389884433e-05,
1063
+ "loss": 0.0716,
1064
+ "num_input_tokens_seen": 125131520,
1065
+ "step": 660
1066
+ },
1067
+ {
1068
+ "epoch": 2.5527831094049906,
1069
+ "grad_norm": 0.56220543384552,
1070
+ "learning_rate": 4.0680784830890405e-05,
1071
+ "loss": 0.0719,
1072
+ "num_input_tokens_seen": 126085248,
1073
+ "step": 665
1074
+ },
1075
+ {
1076
+ "epoch": 2.5719769673704413,
1077
+ "grad_norm": 0.4952705204486847,
1078
+ "learning_rate": 4.05497286709676e-05,
1079
+ "loss": 0.0705,
1080
+ "num_input_tokens_seen": 127024128,
1081
+ "step": 670
1082
+ },
1083
+ {
1084
+ "epoch": 2.5911708253358925,
1085
+ "grad_norm": 0.4931686520576477,
1086
+ "learning_rate": 4.0417971815713584e-05,
1087
+ "loss": 0.0705,
1088
+ "num_input_tokens_seen": 127971072,
1089
+ "step": 675
1090
+ },
1091
+ {
1092
+ "epoch": 2.6103646833013436,
1093
+ "grad_norm": 0.5248854756355286,
1094
+ "learning_rate": 4.028552020230031e-05,
1095
+ "loss": 0.0741,
1096
+ "num_input_tokens_seen": 128914176,
1097
+ "step": 680
1098
+ },
1099
+ {
1100
+ "epoch": 2.629558541266795,
1101
+ "grad_norm": 0.48179954290390015,
1102
+ "learning_rate": 4.015237979920666e-05,
1103
+ "loss": 0.0722,
1104
+ "num_input_tokens_seen": 129855360,
1105
+ "step": 685
1106
+ },
1107
+ {
1108
+ "epoch": 2.6487523992322455,
1109
+ "grad_norm": 0.5252842903137207,
1110
+ "learning_rate": 4.001855660594948e-05,
1111
+ "loss": 0.0746,
1112
+ "num_input_tokens_seen": 130808960,
1113
+ "step": 690
1114
+ },
1115
+ {
1116
+ "epoch": 2.6679462571976966,
1117
+ "grad_norm": 0.48107513785362244,
1118
+ "learning_rate": 3.9884056652813184e-05,
1119
+ "loss": 0.0692,
1120
+ "num_input_tokens_seen": 131750144,
1121
+ "step": 695
1122
+ },
1123
+ {
1124
+ "epoch": 2.6871401151631478,
1125
+ "grad_norm": 0.6338366866111755,
1126
+ "learning_rate": 3.974888600057808e-05,
1127
+ "loss": 0.0713,
1128
+ "num_input_tokens_seen": 132695936,
1129
+ "step": 700
1130
+ },
1131
+ {
1132
+ "epoch": 2.706333973128599,
1133
+ "grad_norm": 0.4953431785106659,
1134
+ "learning_rate": 3.9613050740247224e-05,
1135
+ "loss": 0.0711,
1136
+ "num_input_tokens_seen": 133666944,
1137
+ "step": 705
1138
+ },
1139
+ {
1140
+ "epoch": 2.72552783109405,
1141
+ "grad_norm": 0.4697812497615814,
1142
+ "learning_rate": 3.947655699277197e-05,
1143
+ "loss": 0.0677,
1144
+ "num_input_tokens_seen": 134611456,
1145
+ "step": 710
1146
+ },
1147
+ {
1148
+ "epoch": 2.744721689059501,
1149
+ "grad_norm": 0.5141253471374512,
1150
+ "learning_rate": 3.933941090877615e-05,
1151
+ "loss": 0.0742,
1152
+ "num_input_tokens_seen": 135558400,
1153
+ "step": 715
1154
+ },
1155
+ {
1156
+ "epoch": 2.763915547024952,
1157
+ "grad_norm": 0.4803192913532257,
1158
+ "learning_rate": 3.920161866827889e-05,
1159
+ "loss": 0.07,
1160
+ "num_input_tokens_seen": 136509696,
1161
+ "step": 720
1162
+ },
1163
+ {
1164
+ "epoch": 2.783109404990403,
1165
+ "grad_norm": 0.47573211789131165,
1166
+ "learning_rate": 3.906318648041617e-05,
1167
+ "loss": 0.0725,
1168
+ "num_input_tokens_seen": 137442816,
1169
+ "step": 725
1170
+ },
1171
+ {
1172
+ "epoch": 2.802303262955854,
1173
+ "grad_norm": 0.5205782055854797,
1174
+ "learning_rate": 3.8924120583160985e-05,
1175
+ "loss": 0.0704,
1176
+ "num_input_tokens_seen": 138383232,
1177
+ "step": 730
1178
+ },
1179
+ {
1180
+ "epoch": 2.8214971209213053,
1181
+ "grad_norm": 0.5315016508102417,
1182
+ "learning_rate": 3.8784427243042296e-05,
1183
+ "loss": 0.0721,
1184
+ "num_input_tokens_seen": 139326464,
1185
+ "step": 735
1186
+ },
1187
+ {
1188
+ "epoch": 2.840690978886756,
1189
+ "grad_norm": 0.4982779324054718,
1190
+ "learning_rate": 3.8644112754862614e-05,
1191
+ "loss": 0.0702,
1192
+ "num_input_tokens_seen": 140295296,
1193
+ "step": 740
1194
+ },
1195
+ {
1196
+ "epoch": 2.859884836852207,
1197
+ "grad_norm": 0.550413191318512,
1198
+ "learning_rate": 3.850318344141439e-05,
1199
+ "loss": 0.0719,
1200
+ "num_input_tokens_seen": 141247488,
1201
+ "step": 745
1202
+ },
1203
+ {
1204
+ "epoch": 2.8790786948176583,
1205
+ "grad_norm": 0.5022615194320679,
1206
+ "learning_rate": 3.8361645653195026e-05,
1207
+ "loss": 0.0712,
1208
+ "num_input_tokens_seen": 142186240,
1209
+ "step": 750
1210
+ },
1211
+ {
1212
+ "epoch": 2.8982725527831095,
1213
+ "grad_norm": 0.5040895342826843,
1214
+ "learning_rate": 3.821950576812081e-05,
1215
+ "loss": 0.0717,
1216
+ "num_input_tokens_seen": 143130624,
1217
+ "step": 755
1218
+ },
1219
+ {
1220
+ "epoch": 2.9174664107485606,
1221
+ "grad_norm": 0.5027932524681091,
1222
+ "learning_rate": 3.807677019123944e-05,
1223
+ "loss": 0.0717,
1224
+ "num_input_tokens_seen": 144061568,
1225
+ "step": 760
1226
+ },
1227
+ {
1228
+ "epoch": 2.9366602687140118,
1229
+ "grad_norm": 0.5451627969741821,
1230
+ "learning_rate": 3.793344535444142e-05,
1231
+ "loss": 0.0694,
1232
+ "num_input_tokens_seen": 145013632,
1233
+ "step": 765
1234
+ },
1235
+ {
1236
+ "epoch": 2.9558541266794625,
1237
+ "grad_norm": 0.5232383012771606,
1238
+ "learning_rate": 3.7789537716170256e-05,
1239
+ "loss": 0.07,
1240
+ "num_input_tokens_seen": 145962880,
1241
+ "step": 770
1242
+ },
1243
+ {
1244
+ "epoch": 2.9750479846449136,
1245
+ "grad_norm": 0.46057307720184326,
1246
+ "learning_rate": 3.764505376113138e-05,
1247
+ "loss": 0.0699,
1248
+ "num_input_tokens_seen": 146916480,
1249
+ "step": 775
1250
+ },
1251
+ {
1252
+ "epoch": 2.9942418426103647,
1253
+ "grad_norm": 0.510372519493103,
1254
+ "learning_rate": 3.7500000000000003e-05,
1255
+ "loss": 0.0705,
1256
+ "num_input_tokens_seen": 147865344,
1257
+ "step": 780
1258
+ },
1259
+ {
1260
+ "epoch": 3.013435700575816,
1261
+ "grad_norm": 0.3846762180328369,
1262
+ "learning_rate": 3.735438296912768e-05,
1263
+ "loss": 0.0515,
1264
+ "num_input_tokens_seen": 148817664,
1265
+ "step": 785
1266
+ },
1267
+ {
1268
+ "epoch": 3.0326295585412666,
1269
+ "grad_norm": 0.5160824656486511,
1270
+ "learning_rate": 3.720820923024778e-05,
1271
+ "loss": 0.0398,
1272
+ "num_input_tokens_seen": 149770880,
1273
+ "step": 790
1274
+ },
1275
+ {
1276
+ "epoch": 3.0518234165067177,
1277
+ "grad_norm": 0.5621275305747986,
1278
+ "learning_rate": 3.7061485370179835e-05,
1279
+ "loss": 0.0388,
1280
+ "num_input_tokens_seen": 150717312,
1281
+ "step": 795
1282
+ },
1283
+ {
1284
+ "epoch": 3.071017274472169,
1285
+ "grad_norm": 0.5459131002426147,
1286
+ "learning_rate": 3.69142180005327e-05,
1287
+ "loss": 0.0392,
1288
+ "num_input_tokens_seen": 151666432,
1289
+ "step": 800
1290
+ },
1291
+ {
1292
+ "epoch": 3.09021113243762,
1293
+ "grad_norm": 0.5296151638031006,
1294
+ "learning_rate": 3.676641375740662e-05,
1295
+ "loss": 0.0373,
1296
+ "num_input_tokens_seen": 152584064,
1297
+ "step": 805
1298
+ },
1299
+ {
1300
+ "epoch": 3.109404990403071,
1301
+ "grad_norm": 0.4850353002548218,
1302
+ "learning_rate": 3.6618079301094216e-05,
1303
+ "loss": 0.039,
1304
+ "num_input_tokens_seen": 153525248,
1305
+ "step": 810
1306
+ },
1307
+ {
1308
+ "epoch": 3.128598848368522,
1309
+ "grad_norm": 0.5585963726043701,
1310
+ "learning_rate": 3.646922131578036e-05,
1311
+ "loss": 0.038,
1312
+ "num_input_tokens_seen": 154456704,
1313
+ "step": 815
1314
+ },
1315
+ {
1316
+ "epoch": 3.147792706333973,
1317
+ "grad_norm": 0.5561864972114563,
1318
+ "learning_rate": 3.631984650924094e-05,
1319
+ "loss": 0.0387,
1320
+ "num_input_tokens_seen": 155405312,
1321
+ "step": 820
1322
+ },
1323
+ {
1324
+ "epoch": 3.166986564299424,
1325
+ "grad_norm": 0.5100105404853821,
1326
+ "learning_rate": 3.6169961612540645e-05,
1327
+ "loss": 0.0387,
1328
+ "num_input_tokens_seen": 156361984,
1329
+ "step": 825
1330
+ },
1331
+ {
1332
+ "epoch": 3.1861804222648753,
1333
+ "grad_norm": 0.5485111474990845,
1334
+ "learning_rate": 3.6019573379729643e-05,
1335
+ "loss": 0.0378,
1336
+ "num_input_tokens_seen": 157296768,
1337
+ "step": 830
1338
+ },
1339
+ {
1340
+ "epoch": 3.2053742802303264,
1341
+ "grad_norm": 0.5596518516540527,
1342
+ "learning_rate": 3.586868858753921e-05,
1343
+ "loss": 0.0376,
1344
+ "num_input_tokens_seen": 158262144,
1345
+ "step": 835
1346
+ },
1347
+ {
1348
+ "epoch": 3.224568138195777,
1349
+ "grad_norm": 0.5062248110771179,
1350
+ "learning_rate": 3.5717314035076355e-05,
1351
+ "loss": 0.0394,
1352
+ "num_input_tokens_seen": 159213952,
1353
+ "step": 840
1354
+ },
1355
+ {
1356
+ "epoch": 3.2437619961612283,
1357
+ "grad_norm": 0.560895562171936,
1358
+ "learning_rate": 3.556545654351749e-05,
1359
+ "loss": 0.0412,
1360
+ "num_input_tokens_seen": 160142208,
1361
+ "step": 845
1362
+ },
1363
+ {
1364
+ "epoch": 3.2629558541266794,
1365
+ "grad_norm": 0.5781249403953552,
1366
+ "learning_rate": 3.5413122955801005e-05,
1367
+ "loss": 0.0379,
1368
+ "num_input_tokens_seen": 161101824,
1369
+ "step": 850
1370
+ },
1371
+ {
1372
+ "epoch": 3.2821497120921306,
1373
+ "grad_norm": 0.5329350233078003,
1374
+ "learning_rate": 3.526032013631893e-05,
1375
+ "loss": 0.0389,
1376
+ "num_input_tokens_seen": 162062592,
1377
+ "step": 855
1378
+ },
1379
+ {
1380
+ "epoch": 3.3013435700575817,
1381
+ "grad_norm": 0.5311657190322876,
1382
+ "learning_rate": 3.510705497060762e-05,
1383
+ "loss": 0.0407,
1384
+ "num_input_tokens_seen": 163008896,
1385
+ "step": 860
1386
+ },
1387
+ {
1388
+ "epoch": 3.320537428023033,
1389
+ "grad_norm": 0.5238428115844727,
1390
+ "learning_rate": 3.4953334365037525e-05,
1391
+ "loss": 0.0396,
1392
+ "num_input_tokens_seen": 163957632,
1393
+ "step": 865
1394
+ },
1395
+ {
1396
+ "epoch": 3.3397312859884836,
1397
+ "grad_norm": 0.5340429544448853,
1398
+ "learning_rate": 3.479916524650188e-05,
1399
+ "loss": 0.0398,
1400
+ "num_input_tokens_seen": 164903680,
1401
+ "step": 870
1402
+ },
1403
+ {
1404
+ "epoch": 3.3589251439539347,
1405
+ "grad_norm": 0.5380253195762634,
1406
+ "learning_rate": 3.4644554562104634e-05,
1407
+ "loss": 0.0378,
1408
+ "num_input_tokens_seen": 165832960,
1409
+ "step": 875
1410
+ },
1411
+ {
1412
+ "epoch": 3.378119001919386,
1413
+ "grad_norm": 0.5895189046859741,
1414
+ "learning_rate": 3.4489509278847414e-05,
1415
+ "loss": 0.0386,
1416
+ "num_input_tokens_seen": 166790016,
1417
+ "step": 880
1418
+ },
1419
+ {
1420
+ "epoch": 3.397312859884837,
1421
+ "grad_norm": 0.5654773712158203,
1422
+ "learning_rate": 3.433403638331553e-05,
1423
+ "loss": 0.0392,
1424
+ "num_input_tokens_seen": 167751808,
1425
+ "step": 885
1426
+ },
1427
+ {
1428
+ "epoch": 3.4165067178502877,
1429
+ "grad_norm": 0.5874072909355164,
1430
+ "learning_rate": 3.417814288136319e-05,
1431
+ "loss": 0.039,
1432
+ "num_input_tokens_seen": 168697984,
1433
+ "step": 890
1434
+ },
1435
+ {
1436
+ "epoch": 3.435700575815739,
1437
+ "grad_norm": 0.5615460276603699,
1438
+ "learning_rate": 3.4021835797797804e-05,
1439
+ "loss": 0.0397,
1440
+ "num_input_tokens_seen": 169634432,
1441
+ "step": 895
1442
+ },
1443
+ {
1444
+ "epoch": 3.45489443378119,
1445
+ "grad_norm": 0.5442382097244263,
1446
+ "learning_rate": 3.386512217606339e-05,
1447
+ "loss": 0.0408,
1448
+ "num_input_tokens_seen": 170583808,
1449
+ "step": 900
1450
+ },
1451
+ {
1452
+ "epoch": 3.474088291746641,
1453
+ "grad_norm": 0.5456710457801819,
1454
+ "learning_rate": 3.370800907792325e-05,
1455
+ "loss": 0.0393,
1456
+ "num_input_tokens_seen": 171575808,
1457
+ "step": 905
1458
+ },
1459
+ {
1460
+ "epoch": 3.4932821497120923,
1461
+ "grad_norm": 0.6690710186958313,
1462
+ "learning_rate": 3.355050358314172e-05,
1463
+ "loss": 0.0399,
1464
+ "num_input_tokens_seen": 172516224,
1465
+ "step": 910
1466
+ },
1467
+ {
1468
+ "epoch": 3.5124760076775434,
1469
+ "grad_norm": 0.5574905276298523,
1470
+ "learning_rate": 3.339261278916512e-05,
1471
+ "loss": 0.0382,
1472
+ "num_input_tokens_seen": 173462784,
1473
+ "step": 915
1474
+ },
1475
+ {
1476
+ "epoch": 3.531669865642994,
1477
+ "grad_norm": 0.6012794971466064,
1478
+ "learning_rate": 3.323434381080199e-05,
1479
+ "loss": 0.0393,
1480
+ "num_input_tokens_seen": 174417408,
1481
+ "step": 920
1482
+ },
1483
+ {
1484
+ "epoch": 3.5508637236084453,
1485
+ "grad_norm": 0.5913425087928772,
1486
+ "learning_rate": 3.307570377990245e-05,
1487
+ "loss": 0.0401,
1488
+ "num_input_tokens_seen": 175372160,
1489
+ "step": 925
1490
+ },
1491
+ {
1492
+ "epoch": 3.5700575815738964,
1493
+ "grad_norm": 0.5533618927001953,
1494
+ "learning_rate": 3.2916699845036816e-05,
1495
+ "loss": 0.0389,
1496
+ "num_input_tokens_seen": 176325376,
1497
+ "step": 930
1498
+ },
1499
+ {
1500
+ "epoch": 3.5892514395393476,
1501
+ "grad_norm": 0.5684598684310913,
1502
+ "learning_rate": 3.2757339171173506e-05,
1503
+ "loss": 0.0405,
1504
+ "num_input_tokens_seen": 177279232,
1505
+ "step": 935
1506
+ },
1507
+ {
1508
+ "epoch": 3.6084452975047983,
1509
+ "grad_norm": 0.5171712040901184,
1510
+ "learning_rate": 3.2597628939356175e-05,
1511
+ "loss": 0.0385,
1512
+ "num_input_tokens_seen": 178230656,
1513
+ "step": 940
1514
+ },
1515
+ {
1516
+ "epoch": 3.6276391554702494,
1517
+ "grad_norm": 0.5861181616783142,
1518
+ "learning_rate": 3.243757634638008e-05,
1519
+ "loss": 0.0406,
1520
+ "num_input_tokens_seen": 179165056,
1521
+ "step": 945
1522
+ },
1523
+ {
1524
+ "epoch": 3.6468330134357005,
1525
+ "grad_norm": 0.5875094532966614,
1526
+ "learning_rate": 3.227718860446782e-05,
1527
+ "loss": 0.0397,
1528
+ "num_input_tokens_seen": 180110464,
1529
+ "step": 950
1530
+ },
1531
+ {
1532
+ "epoch": 3.6660268714011517,
1533
+ "grad_norm": 0.518147885799408,
1534
+ "learning_rate": 3.211647294094437e-05,
1535
+ "loss": 0.0406,
1536
+ "num_input_tokens_seen": 181050368,
1537
+ "step": 955
1538
+ },
1539
+ {
1540
+ "epoch": 3.685220729366603,
1541
+ "grad_norm": 0.585909903049469,
1542
+ "learning_rate": 3.195543659791132e-05,
1543
+ "loss": 0.0399,
1544
+ "num_input_tokens_seen": 181985408,
1545
+ "step": 960
1546
+ },
1547
+ {
1548
+ "epoch": 3.704414587332054,
1549
+ "grad_norm": 0.5211689472198486,
1550
+ "learning_rate": 3.179408683192061e-05,
1551
+ "loss": 0.0403,
1552
+ "num_input_tokens_seen": 182932096,
1553
+ "step": 965
1554
+ },
1555
+ {
1556
+ "epoch": 3.7236084452975047,
1557
+ "grad_norm": 0.5516882538795471,
1558
+ "learning_rate": 3.163243091364752e-05,
1559
+ "loss": 0.0414,
1560
+ "num_input_tokens_seen": 183894528,
1561
+ "step": 970
1562
+ },
1563
+ {
1564
+ "epoch": 3.742802303262956,
1565
+ "grad_norm": 0.5610490441322327,
1566
+ "learning_rate": 3.147047612756302e-05,
1567
+ "loss": 0.04,
1568
+ "num_input_tokens_seen": 184841472,
1569
+ "step": 975
1570
+ },
1571
+ {
1572
+ "epoch": 3.761996161228407,
1573
+ "grad_norm": 0.5976274013519287,
1574
+ "learning_rate": 3.130822977160554e-05,
1575
+ "loss": 0.0411,
1576
+ "num_input_tokens_seen": 185803520,
1577
+ "step": 980
1578
+ },
1579
+ {
1580
+ "epoch": 3.781190019193858,
1581
+ "grad_norm": 0.5341821312904358,
1582
+ "learning_rate": 3.114569915685213e-05,
1583
+ "loss": 0.0416,
1584
+ "num_input_tokens_seen": 186746880,
1585
+ "step": 985
1586
+ },
1587
+ {
1588
+ "epoch": 3.800383877159309,
1589
+ "grad_norm": 0.5178216695785522,
1590
+ "learning_rate": 3.098289160718895e-05,
1591
+ "loss": 0.0418,
1592
+ "num_input_tokens_seen": 187703040,
1593
+ "step": 990
1594
+ },
1595
+ {
1596
+ "epoch": 3.81957773512476,
1597
+ "grad_norm": 0.6667547225952148,
1598
+ "learning_rate": 3.081981445898131e-05,
1599
+ "loss": 0.0399,
1600
+ "num_input_tokens_seen": 188650112,
1601
+ "step": 995
1602
+ },
1603
+ {
1604
+ "epoch": 3.838771593090211,
1605
+ "grad_norm": 0.6052145957946777,
1606
+ "learning_rate": 3.065647506074306e-05,
1607
+ "loss": 0.0369,
1608
+ "num_input_tokens_seen": 189577088,
1609
+ "step": 1000
1610
+ },
1611
+ {
1612
+ "epoch": 3.8579654510556622,
1613
+ "grad_norm": 0.5983301401138306,
1614
+ "learning_rate": 3.0492880772805433e-05,
1615
+ "loss": 0.0389,
1616
+ "num_input_tokens_seen": 190533376,
1617
+ "step": 1005
1618
+ },
1619
+ {
1620
+ "epoch": 3.8771593090211134,
1621
+ "grad_norm": 0.5493575930595398,
1622
+ "learning_rate": 3.03290389669854e-05,
1623
+ "loss": 0.039,
1624
+ "num_input_tokens_seen": 191484672,
1625
+ "step": 1010
1626
+ },
1627
+ {
1628
+ "epoch": 3.8963531669865645,
1629
+ "grad_norm": 0.5318569540977478,
1630
+ "learning_rate": 3.016495702625351e-05,
1631
+ "loss": 0.0401,
1632
+ "num_input_tokens_seen": 192425728,
1633
+ "step": 1015
1634
+ },
1635
+ {
1636
+ "epoch": 3.9155470249520152,
1637
+ "grad_norm": 0.5920220613479614,
1638
+ "learning_rate": 3.0000642344401113e-05,
1639
+ "loss": 0.0408,
1640
+ "num_input_tokens_seen": 193386368,
1641
+ "step": 1020
1642
+ },
1643
+ {
1644
+ "epoch": 3.9347408829174664,
1645
+ "grad_norm": 0.48210155963897705,
1646
+ "learning_rate": 2.983610232570728e-05,
1647
+ "loss": 0.0389,
1648
+ "num_input_tokens_seen": 194338560,
1649
+ "step": 1025
1650
+ },
1651
+ {
1652
+ "epoch": 3.9539347408829175,
1653
+ "grad_norm": 0.551249623298645,
1654
+ "learning_rate": 2.9671344384605127e-05,
1655
+ "loss": 0.0392,
1656
+ "num_input_tokens_seen": 195283968,
1657
+ "step": 1030
1658
+ },
1659
+ {
1660
+ "epoch": 3.9731285988483687,
1661
+ "grad_norm": 0.5769145488739014,
1662
+ "learning_rate": 2.950637594534765e-05,
1663
+ "loss": 0.0397,
1664
+ "num_input_tokens_seen": 196222080,
1665
+ "step": 1035
1666
+ },
1667
+ {
1668
+ "epoch": 3.9923224568138194,
1669
+ "grad_norm": 0.49872222542762756,
1670
+ "learning_rate": 2.9341204441673266e-05,
1671
+ "loss": 0.0396,
1672
+ "num_input_tokens_seen": 197174016,
1673
+ "step": 1040
1674
+ },
1675
+ {
1676
+ "epoch": 4.0115163147792705,
1677
+ "grad_norm": 0.32796505093574524,
1678
+ "learning_rate": 2.917583731647077e-05,
1679
+ "loss": 0.0271,
1680
+ "num_input_tokens_seen": 198129792,
1681
+ "step": 1045
1682
+ },
1683
+ {
1684
+ "epoch": 4.030710172744722,
1685
+ "grad_norm": 0.4801371395587921,
1686
+ "learning_rate": 2.9010282021444008e-05,
1687
+ "loss": 0.0188,
1688
+ "num_input_tokens_seen": 199088384,
1689
+ "step": 1050
1690
+ },
1691
+ {
1692
+ "epoch": 4.049904030710173,
1693
+ "grad_norm": 0.5048861503601074,
1694
+ "learning_rate": 2.8844546016776013e-05,
1695
+ "loss": 0.0195,
1696
+ "num_input_tokens_seen": 200040832,
1697
+ "step": 1055
1698
+ },
1699
+ {
1700
+ "epoch": 4.069097888675624,
1701
+ "grad_norm": 0.44816502928733826,
1702
+ "learning_rate": 2.8678636770792906e-05,
1703
+ "loss": 0.0174,
1704
+ "num_input_tokens_seen": 200993024,
1705
+ "step": 1060
1706
+ },
1707
+ {
1708
+ "epoch": 4.088291746641075,
1709
+ "grad_norm": 0.5177501440048218,
1710
+ "learning_rate": 2.851256175962732e-05,
1711
+ "loss": 0.0175,
1712
+ "num_input_tokens_seen": 201925376,
1713
+ "step": 1065
1714
+ },
1715
+ {
1716
+ "epoch": 4.107485604606526,
1717
+ "grad_norm": 0.44937270879745483,
1718
+ "learning_rate": 2.8346328466881545e-05,
1719
+ "loss": 0.0175,
1720
+ "num_input_tokens_seen": 202875520,
1721
+ "step": 1070
1722
+ },
1723
+ {
1724
+ "epoch": 4.126679462571977,
1725
+ "grad_norm": 0.46722468733787537,
1726
+ "learning_rate": 2.8179944383290274e-05,
1727
+ "loss": 0.0177,
1728
+ "num_input_tokens_seen": 203793536,
1729
+ "step": 1075
1730
+ },
1731
+ {
1732
+ "epoch": 4.145873320537428,
1733
+ "grad_norm": 0.47768163681030273,
1734
+ "learning_rate": 2.8013417006383076e-05,
1735
+ "loss": 0.0178,
1736
+ "num_input_tokens_seen": 204730496,
1737
+ "step": 1080
1738
+ },
1739
+ {
1740
+ "epoch": 4.165067178502879,
1741
+ "grad_norm": 0.5000248551368713,
1742
+ "learning_rate": 2.784675384014656e-05,
1743
+ "loss": 0.0183,
1744
+ "num_input_tokens_seen": 205692416,
1745
+ "step": 1085
1746
+ },
1747
+ {
1748
+ "epoch": 4.18426103646833,
1749
+ "grad_norm": 0.43874186277389526,
1750
+ "learning_rate": 2.7679962394686198e-05,
1751
+ "loss": 0.0163,
1752
+ "num_input_tokens_seen": 206628608,
1753
+ "step": 1090
1754
+ },
1755
+ {
1756
+ "epoch": 4.203454894433781,
1757
+ "grad_norm": 0.6000416874885559,
1758
+ "learning_rate": 2.751305018588793e-05,
1759
+ "loss": 0.0173,
1760
+ "num_input_tokens_seen": 207572096,
1761
+ "step": 1095
1762
+ },
1763
+ {
1764
+ "epoch": 4.222648752399232,
1765
+ "grad_norm": 0.5658661127090454,
1766
+ "learning_rate": 2.7346024735079486e-05,
1767
+ "loss": 0.0174,
1768
+ "num_input_tokens_seen": 208517376,
1769
+ "step": 1100
1770
+ },
1771
+ {
1772
+ "epoch": 4.241842610364683,
1773
+ "grad_norm": 0.47470784187316895,
1774
+ "learning_rate": 2.717889356869146e-05,
1775
+ "loss": 0.017,
1776
+ "num_input_tokens_seen": 209485056,
1777
+ "step": 1105
1778
+ },
1779
+ {
1780
+ "epoch": 4.2610364683301345,
1781
+ "grad_norm": 0.5162433385848999,
1782
+ "learning_rate": 2.7011664217918154e-05,
1783
+ "loss": 0.0176,
1784
+ "num_input_tokens_seen": 210415232,
1785
+ "step": 1110
1786
+ },
1787
+ {
1788
+ "epoch": 4.280230326295586,
1789
+ "grad_norm": 0.5180822014808655,
1790
+ "learning_rate": 2.684434421837821e-05,
1791
+ "loss": 0.018,
1792
+ "num_input_tokens_seen": 211362560,
1793
+ "step": 1115
1794
+ },
1795
+ {
1796
+ "epoch": 4.299424184261037,
1797
+ "grad_norm": 0.5464392304420471,
1798
+ "learning_rate": 2.667694110977506e-05,
1799
+ "loss": 0.0176,
1800
+ "num_input_tokens_seen": 212339456,
1801
+ "step": 1120
1802
+ },
1803
+ {
1804
+ "epoch": 4.318618042226488,
1805
+ "grad_norm": 0.5567265748977661,
1806
+ "learning_rate": 2.6509462435557152e-05,
1807
+ "loss": 0.0187,
1808
+ "num_input_tokens_seen": 213288320,
1809
+ "step": 1125
1810
+ },
1811
+ {
1812
+ "epoch": 4.337811900191938,
1813
+ "grad_norm": 0.5592502951622009,
1814
+ "learning_rate": 2.6341915742578037e-05,
1815
+ "loss": 0.0189,
1816
+ "num_input_tokens_seen": 214232576,
1817
+ "step": 1130
1818
+ },
1819
+ {
1820
+ "epoch": 4.357005758157389,
1821
+ "grad_norm": 0.53173828125,
1822
+ "learning_rate": 2.617430858075632e-05,
1823
+ "loss": 0.0174,
1824
+ "num_input_tokens_seen": 215180800,
1825
+ "step": 1135
1826
+ },
1827
+ {
1828
+ "epoch": 4.3761996161228405,
1829
+ "grad_norm": 0.4227728843688965,
1830
+ "learning_rate": 2.600664850273538e-05,
1831
+ "loss": 0.0175,
1832
+ "num_input_tokens_seen": 216116480,
1833
+ "step": 1140
1834
+ },
1835
+ {
1836
+ "epoch": 4.395393474088292,
1837
+ "grad_norm": 0.6441555023193359,
1838
+ "learning_rate": 2.5838943063543136e-05,
1839
+ "loss": 0.0174,
1840
+ "num_input_tokens_seen": 217079552,
1841
+ "step": 1145
1842
+ },
1843
+ {
1844
+ "epoch": 4.414587332053743,
1845
+ "grad_norm": 0.4902968108654022,
1846
+ "learning_rate": 2.5671199820251534e-05,
1847
+ "loss": 0.016,
1848
+ "num_input_tokens_seen": 218026496,
1849
+ "step": 1150
1850
+ },
1851
+ {
1852
+ "epoch": 4.433781190019194,
1853
+ "grad_norm": 0.46307000517845154,
1854
+ "learning_rate": 2.550342633163601e-05,
1855
+ "loss": 0.0171,
1856
+ "num_input_tokens_seen": 218977152,
1857
+ "step": 1155
1858
+ },
1859
+ {
1860
+ "epoch": 4.452975047984645,
1861
+ "grad_norm": 0.4390980899333954,
1862
+ "learning_rate": 2.5335630157834937e-05,
1863
+ "loss": 0.0178,
1864
+ "num_input_tokens_seen": 219918080,
1865
+ "step": 1160
1866
+ },
1867
+ {
1868
+ "epoch": 4.472168905950096,
1869
+ "grad_norm": 0.5380067825317383,
1870
+ "learning_rate": 2.5167818860008908e-05,
1871
+ "loss": 0.0182,
1872
+ "num_input_tokens_seen": 220859776,
1873
+ "step": 1165
1874
+ },
1875
+ {
1876
+ "epoch": 4.491362763915547,
1877
+ "grad_norm": 0.6842356324195862,
1878
+ "learning_rate": 2.5e-05,
1879
+ "loss": 0.0184,
1880
+ "num_input_tokens_seen": 221800192,
1881
+ "step": 1170
1882
+ },
1883
+ {
1884
+ "epoch": 4.510556621880998,
1885
+ "grad_norm": 0.5243207812309265,
1886
+ "learning_rate": 2.48321811399911e-05,
1887
+ "loss": 0.0182,
1888
+ "num_input_tokens_seen": 222755840,
1889
+ "step": 1175
1890
+ },
1891
+ {
1892
+ "epoch": 4.529750479846449,
1893
+ "grad_norm": 0.5043371915817261,
1894
+ "learning_rate": 2.4664369842165068e-05,
1895
+ "loss": 0.0182,
1896
+ "num_input_tokens_seen": 223683328,
1897
+ "step": 1180
1898
+ },
1899
+ {
1900
+ "epoch": 4.5489443378119,
1901
+ "grad_norm": 0.5096651315689087,
1902
+ "learning_rate": 2.4496573668363996e-05,
1903
+ "loss": 0.0192,
1904
+ "num_input_tokens_seen": 224617088,
1905
+ "step": 1185
1906
+ },
1907
+ {
1908
+ "epoch": 4.568138195777351,
1909
+ "grad_norm": 0.4918656349182129,
1910
+ "learning_rate": 2.4328800179748475e-05,
1911
+ "loss": 0.0189,
1912
+ "num_input_tokens_seen": 225549824,
1913
+ "step": 1190
1914
+ },
1915
+ {
1916
+ "epoch": 4.587332053742802,
1917
+ "grad_norm": 0.50703364610672,
1918
+ "learning_rate": 2.4161056936456873e-05,
1919
+ "loss": 0.0179,
1920
+ "num_input_tokens_seen": 226499968,
1921
+ "step": 1195
1922
+ },
1923
+ {
1924
+ "epoch": 4.606525911708253,
1925
+ "grad_norm": 0.535247266292572,
1926
+ "learning_rate": 2.399335149726463e-05,
1927
+ "loss": 0.0184,
1928
+ "num_input_tokens_seen": 227438592,
1929
+ "step": 1200
1930
+ },
1931
+ {
1932
+ "epoch": 4.6257197696737045,
1933
+ "grad_norm": 0.5735894441604614,
1934
+ "learning_rate": 2.3825691419243694e-05,
1935
+ "loss": 0.0173,
1936
+ "num_input_tokens_seen": 228392064,
1937
+ "step": 1205
1938
+ },
1939
+ {
1940
+ "epoch": 4.644913627639156,
1941
+ "grad_norm": 0.525362491607666,
1942
+ "learning_rate": 2.365808425742196e-05,
1943
+ "loss": 0.0172,
1944
+ "num_input_tokens_seen": 229337856,
1945
+ "step": 1210
1946
+ },
1947
+ {
1948
+ "epoch": 4.664107485604607,
1949
+ "grad_norm": 0.548953115940094,
1950
+ "learning_rate": 2.3490537564442847e-05,
1951
+ "loss": 0.0172,
1952
+ "num_input_tokens_seen": 230301184,
1953
+ "step": 1215
1954
+ },
1955
+ {
1956
+ "epoch": 4.683301343570058,
1957
+ "grad_norm": 0.543995201587677,
1958
+ "learning_rate": 2.3323058890224938e-05,
1959
+ "loss": 0.0183,
1960
+ "num_input_tokens_seen": 231239808,
1961
+ "step": 1220
1962
+ },
1963
+ {
1964
+ "epoch": 4.702495201535509,
1965
+ "grad_norm": 0.4721289873123169,
1966
+ "learning_rate": 2.3155655781621793e-05,
1967
+ "loss": 0.017,
1968
+ "num_input_tokens_seen": 232200960,
1969
+ "step": 1225
1970
+ },
1971
+ {
1972
+ "epoch": 4.72168905950096,
1973
+ "grad_norm": 0.5055978298187256,
1974
+ "learning_rate": 2.2988335782081855e-05,
1975
+ "loss": 0.018,
1976
+ "num_input_tokens_seen": 233170176,
1977
+ "step": 1230
1978
+ },
1979
+ {
1980
+ "epoch": 4.74088291746641,
1981
+ "grad_norm": 0.5819576382637024,
1982
+ "learning_rate": 2.2821106431308544e-05,
1983
+ "loss": 0.0192,
1984
+ "num_input_tokens_seen": 234121984,
1985
+ "step": 1235
1986
+ },
1987
+ {
1988
+ "epoch": 4.760076775431862,
1989
+ "grad_norm": 0.5343950986862183,
1990
+ "learning_rate": 2.265397526492052e-05,
1991
+ "loss": 0.019,
1992
+ "num_input_tokens_seen": 235080448,
1993
+ "step": 1240
1994
+ },
1995
+ {
1996
+ "epoch": 4.779270633397313,
1997
+ "grad_norm": 0.5696731805801392,
1998
+ "learning_rate": 2.2486949814112077e-05,
1999
+ "loss": 0.0167,
2000
+ "num_input_tokens_seen": 236029440,
2001
+ "step": 1245
2002
+ },
2003
+ {
2004
+ "epoch": 4.798464491362764,
2005
+ "grad_norm": 0.513455331325531,
2006
+ "learning_rate": 2.2320037605313808e-05,
2007
+ "loss": 0.0172,
2008
+ "num_input_tokens_seen": 236989952,
2009
+ "step": 1250
2010
+ },
2011
+ {
2012
+ "epoch": 4.817658349328215,
2013
+ "grad_norm": 0.5600264072418213,
2014
+ "learning_rate": 2.2153246159853446e-05,
2015
+ "loss": 0.0178,
2016
+ "num_input_tokens_seen": 237952640,
2017
+ "step": 1255
2018
+ },
2019
+ {
2020
+ "epoch": 4.836852207293666,
2021
+ "grad_norm": 0.5633450746536255,
2022
+ "learning_rate": 2.1986582993616926e-05,
2023
+ "loss": 0.0179,
2024
+ "num_input_tokens_seen": 238887552,
2025
+ "step": 1260
2026
+ },
2027
+ {
2028
+ "epoch": 4.856046065259117,
2029
+ "grad_norm": 0.4715237021446228,
2030
+ "learning_rate": 2.1820055616709735e-05,
2031
+ "loss": 0.0163,
2032
+ "num_input_tokens_seen": 239837440,
2033
+ "step": 1265
2034
+ },
2035
+ {
2036
+ "epoch": 4.8752399232245685,
2037
+ "grad_norm": 0.6615576148033142,
2038
+ "learning_rate": 2.1653671533118468e-05,
2039
+ "loss": 0.019,
2040
+ "num_input_tokens_seen": 240771712,
2041
+ "step": 1270
2042
+ },
2043
+ {
2044
+ "epoch": 4.894433781190019,
2045
+ "grad_norm": 0.4694409668445587,
2046
+ "learning_rate": 2.148743824037269e-05,
2047
+ "loss": 0.0169,
2048
+ "num_input_tokens_seen": 241706496,
2049
+ "step": 1275
2050
+ },
2051
+ {
2052
+ "epoch": 4.91362763915547,
2053
+ "grad_norm": 0.5807188153266907,
2054
+ "learning_rate": 2.1321363229207096e-05,
2055
+ "loss": 0.0165,
2056
+ "num_input_tokens_seen": 242651520,
2057
+ "step": 1280
2058
+ },
2059
+ {
2060
+ "epoch": 4.932821497120921,
2061
+ "grad_norm": 0.5913830399513245,
2062
+ "learning_rate": 2.115545398322399e-05,
2063
+ "loss": 0.018,
2064
+ "num_input_tokens_seen": 243610880,
2065
+ "step": 1285
2066
+ },
2067
+ {
2068
+ "epoch": 4.952015355086372,
2069
+ "grad_norm": 0.47264307737350464,
2070
+ "learning_rate": 2.098971797855599e-05,
2071
+ "loss": 0.0171,
2072
+ "num_input_tokens_seen": 244553344,
2073
+ "step": 1290
2074
+ },
2075
+ {
2076
+ "epoch": 4.971209213051823,
2077
+ "grad_norm": 0.5455983281135559,
2078
+ "learning_rate": 2.0824162683529224e-05,
2079
+ "loss": 0.0178,
2080
+ "num_input_tokens_seen": 245489280,
2081
+ "step": 1295
2082
+ },
2083
+ {
2084
+ "epoch": 4.990403071017274,
2085
+ "grad_norm": 0.5236849784851074,
2086
+ "learning_rate": 2.0658795558326743e-05,
2087
+ "loss": 0.0177,
2088
+ "num_input_tokens_seen": 246430848,
2089
+ "step": 1300
2090
+ },
2091
+ {
2092
+ "epoch": 5.009596928982726,
2093
+ "grad_norm": 0.3665928840637207,
2094
+ "learning_rate": 2.0493624054652357e-05,
2095
+ "loss": 0.0129,
2096
+ "num_input_tokens_seen": 247392000,
2097
+ "step": 1305
2098
+ },
2099
+ {
2100
+ "epoch": 5.028790786948177,
2101
+ "grad_norm": 0.2873888313770294,
2102
+ "learning_rate": 2.0328655615394882e-05,
2103
+ "loss": 0.0076,
2104
+ "num_input_tokens_seen": 248329472,
2105
+ "step": 1310
2106
+ },
2107
+ {
2108
+ "epoch": 5.047984644913628,
2109
+ "grad_norm": 0.3191753029823303,
2110
+ "learning_rate": 2.016389767429272e-05,
2111
+ "loss": 0.0069,
2112
+ "num_input_tokens_seen": 249250688,
2113
+ "step": 1315
2114
+ },
2115
+ {
2116
+ "epoch": 5.067178502879079,
2117
+ "grad_norm": 0.3973754048347473,
2118
+ "learning_rate": 1.9999357655598893e-05,
2119
+ "loss": 0.0068,
2120
+ "num_input_tokens_seen": 250189312,
2121
+ "step": 1320
2122
+ },
2123
+ {
2124
+ "epoch": 5.08637236084453,
2125
+ "grad_norm": 0.27491700649261475,
2126
+ "learning_rate": 1.98350429737465e-05,
2127
+ "loss": 0.0063,
2128
+ "num_input_tokens_seen": 251125632,
2129
+ "step": 1325
2130
+ },
2131
+ {
2132
+ "epoch": 5.10556621880998,
2133
+ "grad_norm": 0.448505163192749,
2134
+ "learning_rate": 1.9670961033014605e-05,
2135
+ "loss": 0.0063,
2136
+ "num_input_tokens_seen": 252084224,
2137
+ "step": 1330
2138
+ },
2139
+ {
2140
+ "epoch": 5.1247600767754315,
2141
+ "grad_norm": 0.39973142743110657,
2142
+ "learning_rate": 1.950711922719458e-05,
2143
+ "loss": 0.0067,
2144
+ "num_input_tokens_seen": 253038080,
2145
+ "step": 1335
2146
+ },
2147
+ {
2148
+ "epoch": 5.143953934740883,
2149
+ "grad_norm": 0.3733747899532318,
2150
+ "learning_rate": 1.934352493925695e-05,
2151
+ "loss": 0.0059,
2152
+ "num_input_tokens_seen": 253959936,
2153
+ "step": 1340
2154
+ },
2155
+ {
2156
+ "epoch": 5.163147792706334,
2157
+ "grad_norm": 0.33044806122779846,
2158
+ "learning_rate": 1.9180185541018695e-05,
2159
+ "loss": 0.006,
2160
+ "num_input_tokens_seen": 254898176,
2161
+ "step": 1345
2162
+ },
2163
+ {
2164
+ "epoch": 5.182341650671785,
2165
+ "grad_norm": 0.39706701040267944,
2166
+ "learning_rate": 1.9017108392811065e-05,
2167
+ "loss": 0.006,
2168
+ "num_input_tokens_seen": 255850368,
2169
+ "step": 1350
2170
+ },
2171
+ {
2172
+ "epoch": 5.201535508637236,
2173
+ "grad_norm": 0.391462117433548,
2174
+ "learning_rate": 1.8854300843147875e-05,
2175
+ "loss": 0.0062,
2176
+ "num_input_tokens_seen": 256820096,
2177
+ "step": 1355
2178
+ },
2179
+ {
2180
+ "epoch": 5.220729366602687,
2181
+ "grad_norm": 0.41619014739990234,
2182
+ "learning_rate": 1.8691770228394456e-05,
2183
+ "loss": 0.0063,
2184
+ "num_input_tokens_seen": 257742080,
2185
+ "step": 1360
2186
+ },
2187
+ {
2188
+ "epoch": 5.239923224568138,
2189
+ "grad_norm": 0.3508637547492981,
2190
+ "learning_rate": 1.852952387243698e-05,
2191
+ "loss": 0.0054,
2192
+ "num_input_tokens_seen": 258705664,
2193
+ "step": 1365
2194
+ },
2195
+ {
2196
+ "epoch": 5.25911708253359,
2197
+ "grad_norm": 0.3165414333343506,
2198
+ "learning_rate": 1.8367569086352483e-05,
2199
+ "loss": 0.006,
2200
+ "num_input_tokens_seen": 259630464,
2201
+ "step": 1370
2202
+ },
2203
+ {
2204
+ "epoch": 5.278310940499041,
2205
+ "grad_norm": 0.31605008244514465,
2206
+ "learning_rate": 1.820591316807939e-05,
2207
+ "loss": 0.006,
2208
+ "num_input_tokens_seen": 260580352,
2209
+ "step": 1375
2210
+ },
2211
+ {
2212
+ "epoch": 5.297504798464491,
2213
+ "grad_norm": 0.235918790102005,
2214
+ "learning_rate": 1.8044563402088684e-05,
2215
+ "loss": 0.005,
2216
+ "num_input_tokens_seen": 261539072,
2217
+ "step": 1380
2218
+ },
2219
+ {
2220
+ "epoch": 5.316698656429942,
2221
+ "grad_norm": 0.37443870306015015,
2222
+ "learning_rate": 1.788352705905563e-05,
2223
+ "loss": 0.0049,
2224
+ "num_input_tokens_seen": 262481152,
2225
+ "step": 1385
2226
+ },
2227
+ {
2228
+ "epoch": 5.335892514395393,
2229
+ "grad_norm": 0.42907220125198364,
2230
+ "learning_rate": 1.7722811395532178e-05,
2231
+ "loss": 0.0052,
2232
+ "num_input_tokens_seen": 263416064,
2233
+ "step": 1390
2234
+ },
2235
+ {
2236
+ "epoch": 5.355086372360844,
2237
+ "grad_norm": 0.3673328161239624,
2238
+ "learning_rate": 1.756242365361993e-05,
2239
+ "loss": 0.0055,
2240
+ "num_input_tokens_seen": 264374016,
2241
+ "step": 1395
2242
+ },
2243
+ {
2244
+ "epoch": 5.3742802303262955,
2245
+ "grad_norm": 0.40815985202789307,
2246
+ "learning_rate": 1.740237106064383e-05,
2247
+ "loss": 0.0057,
2248
+ "num_input_tokens_seen": 265326976,
2249
+ "step": 1400
2250
+ },
2251
+ {
2252
+ "epoch": 5.393474088291747,
2253
+ "grad_norm": 0.3974185287952423,
2254
+ "learning_rate": 1.72426608288265e-05,
2255
+ "loss": 0.0053,
2256
+ "num_input_tokens_seen": 266300672,
2257
+ "step": 1405
2258
+ },
2259
+ {
2260
+ "epoch": 5.412667946257198,
2261
+ "grad_norm": 0.415446013212204,
2262
+ "learning_rate": 1.7083300154963193e-05,
2263
+ "loss": 0.006,
2264
+ "num_input_tokens_seen": 267247488,
2265
+ "step": 1410
2266
+ },
2267
+ {
2268
+ "epoch": 5.431861804222649,
2269
+ "grad_norm": 0.24956613779067993,
2270
+ "learning_rate": 1.6924296220097556e-05,
2271
+ "loss": 0.0054,
2272
+ "num_input_tokens_seen": 268195072,
2273
+ "step": 1415
2274
+ },
2275
+ {
2276
+ "epoch": 5.4510556621881,
2277
+ "grad_norm": 0.3614320456981659,
2278
+ "learning_rate": 1.6765656189198013e-05,
2279
+ "loss": 0.0058,
2280
+ "num_input_tokens_seen": 269156736,
2281
+ "step": 1420
2282
+ },
2283
+ {
2284
+ "epoch": 5.470249520153551,
2285
+ "grad_norm": 0.3081935942173004,
2286
+ "learning_rate": 1.6607387210834887e-05,
2287
+ "loss": 0.0059,
2288
+ "num_input_tokens_seen": 270111232,
2289
+ "step": 1425
2290
+ },
2291
+ {
2292
+ "epoch": 5.4894433781190015,
2293
+ "grad_norm": 0.616385281085968,
2294
+ "learning_rate": 1.6449496416858284e-05,
2295
+ "loss": 0.006,
2296
+ "num_input_tokens_seen": 271075328,
2297
+ "step": 1430
2298
+ },
2299
+ {
2300
+ "epoch": 5.508637236084453,
2301
+ "grad_norm": 0.2966477572917938,
2302
+ "learning_rate": 1.6291990922076745e-05,
2303
+ "loss": 0.0056,
2304
+ "num_input_tokens_seen": 272035200,
2305
+ "step": 1435
2306
+ },
2307
+ {
2308
+ "epoch": 5.527831094049904,
2309
+ "grad_norm": 0.3426896035671234,
2310
+ "learning_rate": 1.613487782393661e-05,
2311
+ "loss": 0.0055,
2312
+ "num_input_tokens_seen": 273006464,
2313
+ "step": 1440
2314
+ },
2315
+ {
2316
+ "epoch": 5.547024952015355,
2317
+ "grad_norm": 0.333395779132843,
2318
+ "learning_rate": 1.59781642022022e-05,
2319
+ "loss": 0.0052,
2320
+ "num_input_tokens_seen": 273947520,
2321
+ "step": 1445
2322
+ },
2323
+ {
2324
+ "epoch": 5.566218809980806,
2325
+ "grad_norm": 0.37326982617378235,
2326
+ "learning_rate": 1.582185711863681e-05,
2327
+ "loss": 0.0058,
2328
+ "num_input_tokens_seen": 274880640,
2329
+ "step": 1450
2330
+ },
2331
+ {
2332
+ "epoch": 5.585412667946257,
2333
+ "grad_norm": 0.47231632471084595,
2334
+ "learning_rate": 1.5665963616684476e-05,
2335
+ "loss": 0.006,
2336
+ "num_input_tokens_seen": 275832576,
2337
+ "step": 1455
2338
+ },
2339
+ {
2340
+ "epoch": 5.604606525911708,
2341
+ "grad_norm": 0.4276968240737915,
2342
+ "learning_rate": 1.5510490721152592e-05,
2343
+ "loss": 0.0059,
2344
+ "num_input_tokens_seen": 276766720,
2345
+ "step": 1460
2346
+ },
2347
+ {
2348
+ "epoch": 5.6238003838771595,
2349
+ "grad_norm": 0.3536054491996765,
2350
+ "learning_rate": 1.535544543789537e-05,
2351
+ "loss": 0.0057,
2352
+ "num_input_tokens_seen": 277722752,
2353
+ "step": 1465
2354
+ },
2355
+ {
2356
+ "epoch": 5.642994241842611,
2357
+ "grad_norm": 0.3875754475593567,
2358
+ "learning_rate": 1.5200834753498128e-05,
2359
+ "loss": 0.0055,
2360
+ "num_input_tokens_seen": 278668544,
2361
+ "step": 1470
2362
+ },
2363
+ {
2364
+ "epoch": 5.662188099808061,
2365
+ "grad_norm": 0.39483705163002014,
2366
+ "learning_rate": 1.5046665634962476e-05,
2367
+ "loss": 0.006,
2368
+ "num_input_tokens_seen": 279602944,
2369
+ "step": 1475
2370
+ },
2371
+ {
2372
+ "epoch": 5.681381957773512,
2373
+ "grad_norm": 0.3274906873703003,
2374
+ "learning_rate": 1.489294502939238e-05,
2375
+ "loss": 0.0058,
2376
+ "num_input_tokens_seen": 280545408,
2377
+ "step": 1480
2378
+ },
2379
+ {
2380
+ "epoch": 5.700575815738963,
2381
+ "grad_norm": 0.41820597648620605,
2382
+ "learning_rate": 1.4739679863681086e-05,
2383
+ "loss": 0.0052,
2384
+ "num_input_tokens_seen": 281486208,
2385
+ "step": 1485
2386
+ },
2387
+ {
2388
+ "epoch": 5.719769673704414,
2389
+ "grad_norm": 0.3023267984390259,
2390
+ "learning_rate": 1.4586877044199016e-05,
2391
+ "loss": 0.0056,
2392
+ "num_input_tokens_seen": 282428032,
2393
+ "step": 1490
2394
+ },
2395
+ {
2396
+ "epoch": 5.7389635316698655,
2397
+ "grad_norm": 0.40845391154289246,
2398
+ "learning_rate": 1.443454345648252e-05,
2399
+ "loss": 0.0061,
2400
+ "num_input_tokens_seen": 283387264,
2401
+ "step": 1495
2402
+ },
2403
+ {
2404
+ "epoch": 5.758157389635317,
2405
+ "grad_norm": 0.2751927971839905,
2406
+ "learning_rate": 1.4282685964923642e-05,
2407
+ "loss": 0.0058,
2408
+ "num_input_tokens_seen": 284347008,
2409
+ "step": 1500
2410
+ },
2411
+ {
2412
+ "epoch": 5.777351247600768,
2413
+ "grad_norm": 0.39462777972221375,
2414
+ "learning_rate": 1.4131311412460796e-05,
2415
+ "loss": 0.0061,
2416
+ "num_input_tokens_seen": 285271424,
2417
+ "step": 1505
2418
+ },
2419
+ {
2420
+ "epoch": 5.796545105566219,
2421
+ "grad_norm": 0.3681143522262573,
2422
+ "learning_rate": 1.398042662027035e-05,
2423
+ "loss": 0.0055,
2424
+ "num_input_tokens_seen": 286222208,
2425
+ "step": 1510
2426
+ },
2427
+ {
2428
+ "epoch": 5.81573896353167,
2429
+ "grad_norm": 0.3678882122039795,
2430
+ "learning_rate": 1.3830038387459354e-05,
2431
+ "loss": 0.0056,
2432
+ "num_input_tokens_seen": 287186304,
2433
+ "step": 1515
2434
+ },
2435
+ {
2436
+ "epoch": 5.834932821497121,
2437
+ "grad_norm": 0.3934548795223236,
2438
+ "learning_rate": 1.3680153490759073e-05,
2439
+ "loss": 0.0055,
2440
+ "num_input_tokens_seen": 288142848,
2441
+ "step": 1520
2442
+ },
2443
+ {
2444
+ "epoch": 5.854126679462572,
2445
+ "grad_norm": 0.3608354926109314,
2446
+ "learning_rate": 1.3530778684219648e-05,
2447
+ "loss": 0.0055,
2448
+ "num_input_tokens_seen": 289076608,
2449
+ "step": 1525
2450
+ },
2451
+ {
2452
+ "epoch": 5.8733205374280235,
2453
+ "grad_norm": 0.3579324781894684,
2454
+ "learning_rate": 1.3381920698905787e-05,
2455
+ "loss": 0.006,
2456
+ "num_input_tokens_seen": 290014848,
2457
+ "step": 1530
2458
+ },
2459
+ {
2460
+ "epoch": 5.892514395393474,
2461
+ "grad_norm": 0.45630770921707153,
2462
+ "learning_rate": 1.3233586242593387e-05,
2463
+ "loss": 0.0056,
2464
+ "num_input_tokens_seen": 290956928,
2465
+ "step": 1535
2466
+ },
2467
+ {
2468
+ "epoch": 5.911708253358925,
2469
+ "grad_norm": 0.48819243907928467,
2470
+ "learning_rate": 1.3085781999467303e-05,
2471
+ "loss": 0.0059,
2472
+ "num_input_tokens_seen": 291889408,
2473
+ "step": 1540
2474
+ },
2475
+ {
2476
+ "epoch": 5.930902111324376,
2477
+ "grad_norm": 0.39040514826774597,
2478
+ "learning_rate": 1.293851462982017e-05,
2479
+ "loss": 0.0056,
2480
+ "num_input_tokens_seen": 292832768,
2481
+ "step": 1545
2482
+ },
2483
+ {
2484
+ "epoch": 5.950095969289827,
2485
+ "grad_norm": 0.33169373869895935,
2486
+ "learning_rate": 1.2791790769752232e-05,
2487
+ "loss": 0.0054,
2488
+ "num_input_tokens_seen": 293767040,
2489
+ "step": 1550
2490
+ },
2491
+ {
2492
+ "epoch": 5.969289827255278,
2493
+ "grad_norm": 0.3252679705619812,
2494
+ "learning_rate": 1.2645617030872328e-05,
2495
+ "loss": 0.0049,
2496
+ "num_input_tokens_seen": 294750208,
2497
+ "step": 1555
2498
+ },
2499
+ {
2500
+ "epoch": 5.9884836852207295,
2501
+ "grad_norm": 0.35827863216400146,
2502
+ "learning_rate": 1.2500000000000006e-05,
2503
+ "loss": 0.0051,
2504
+ "num_input_tokens_seen": 295721088,
2505
+ "step": 1560
2506
+ },
2507
+ {
2508
+ "epoch": 6.007677543186181,
2509
+ "grad_norm": 0.2643219530582428,
2510
+ "learning_rate": 1.2354946238868631e-05,
2511
+ "loss": 0.0037,
2512
+ "num_input_tokens_seen": 296669184,
2513
+ "step": 1565
2514
+ },
2515
+ {
2516
+ "epoch": 6.026871401151632,
2517
+ "grad_norm": 0.16359519958496094,
2518
+ "learning_rate": 1.2210462283829755e-05,
2519
+ "loss": 0.0021,
2520
+ "num_input_tokens_seen": 297624448,
2521
+ "step": 1570
2522
+ },
2523
+ {
2524
+ "epoch": 6.046065259117083,
2525
+ "grad_norm": 0.17375914752483368,
2526
+ "learning_rate": 1.2066554645558578e-05,
2527
+ "loss": 0.0019,
2528
+ "num_input_tokens_seen": 298558080,
2529
+ "step": 1575
2530
+ },
2531
+ {
2532
+ "epoch": 6.065259117082533,
2533
+ "grad_norm": 0.16882829368114471,
2534
+ "learning_rate": 1.1923229808760564e-05,
2535
+ "loss": 0.002,
2536
+ "num_input_tokens_seen": 299492352,
2537
+ "step": 1580
2538
+ },
2539
+ {
2540
+ "epoch": 6.084452975047984,
2541
+ "grad_norm": 0.15032212436199188,
2542
+ "learning_rate": 1.1780494231879183e-05,
2543
+ "loss": 0.0017,
2544
+ "num_input_tokens_seen": 300446976,
2545
+ "step": 1585
2546
+ },
2547
+ {
2548
+ "epoch": 6.1036468330134355,
2549
+ "grad_norm": 0.1795198768377304,
2550
+ "learning_rate": 1.1638354346804971e-05,
2551
+ "loss": 0.0016,
2552
+ "num_input_tokens_seen": 301379328,
2553
+ "step": 1590
2554
+ },
2555
+ {
2556
+ "epoch": 6.122840690978887,
2557
+ "grad_norm": 0.19939038157463074,
2558
+ "learning_rate": 1.1496816558585622e-05,
2559
+ "loss": 0.0017,
2560
+ "num_input_tokens_seen": 302306944,
2561
+ "step": 1595
2562
+ },
2563
+ {
2564
+ "epoch": 6.142034548944338,
2565
+ "grad_norm": 0.13161912560462952,
2566
+ "learning_rate": 1.1355887245137383e-05,
2567
+ "loss": 0.0014,
2568
+ "num_input_tokens_seen": 303246848,
2569
+ "step": 1600
2570
+ },
2571
+ {
2572
+ "epoch": 6.161228406909789,
2573
+ "grad_norm": 0.12050630152225494,
2574
+ "learning_rate": 1.121557275695771e-05,
2575
+ "loss": 0.0018,
2576
+ "num_input_tokens_seen": 304181248,
2577
+ "step": 1605
2578
+ },
2579
+ {
2580
+ "epoch": 6.18042226487524,
2581
+ "grad_norm": 0.1479523628950119,
2582
+ "learning_rate": 1.1075879416839023e-05,
2583
+ "loss": 0.002,
2584
+ "num_input_tokens_seen": 305139200,
2585
+ "step": 1610
2586
+ },
2587
+ {
2588
+ "epoch": 6.199616122840691,
2589
+ "grad_norm": 0.2239997535943985,
2590
+ "learning_rate": 1.093681351958383e-05,
2591
+ "loss": 0.0017,
2592
+ "num_input_tokens_seen": 306099328,
2593
+ "step": 1615
2594
+ },
2595
+ {
2596
+ "epoch": 6.218809980806142,
2597
+ "grad_norm": 0.1326994150876999,
2598
+ "learning_rate": 1.0798381331721109e-05,
2599
+ "loss": 0.0016,
2600
+ "num_input_tokens_seen": 307053568,
2601
+ "step": 1620
2602
+ },
2603
+ {
2604
+ "epoch": 6.2380038387715935,
2605
+ "grad_norm": 0.16336026787757874,
2606
+ "learning_rate": 1.0660589091223855e-05,
2607
+ "loss": 0.0016,
2608
+ "num_input_tokens_seen": 308003200,
2609
+ "step": 1625
2610
+ },
2611
+ {
2612
+ "epoch": 6.257197696737044,
2613
+ "grad_norm": 0.19519683718681335,
2614
+ "learning_rate": 1.052344300722803e-05,
2615
+ "loss": 0.0016,
2616
+ "num_input_tokens_seen": 308958720,
2617
+ "step": 1630
2618
+ },
2619
+ {
2620
+ "epoch": 6.276391554702495,
2621
+ "grad_norm": 0.19589100778102875,
2622
+ "learning_rate": 1.0386949259752785e-05,
2623
+ "loss": 0.0017,
2624
+ "num_input_tokens_seen": 309904384,
2625
+ "step": 1635
2626
+ },
2627
+ {
2628
+ "epoch": 6.295585412667946,
2629
+ "grad_norm": 0.1591753512620926,
2630
+ "learning_rate": 1.0251113999421935e-05,
2631
+ "loss": 0.0017,
2632
+ "num_input_tokens_seen": 310861568,
2633
+ "step": 1640
2634
+ },
2635
+ {
2636
+ "epoch": 6.314779270633397,
2637
+ "grad_norm": 0.1489296555519104,
2638
+ "learning_rate": 1.0115943347186826e-05,
2639
+ "loss": 0.0015,
2640
+ "num_input_tokens_seen": 311800064,
2641
+ "step": 1645
2642
+ },
2643
+ {
2644
+ "epoch": 6.333973128598848,
2645
+ "grad_norm": 0.1340964287519455,
2646
+ "learning_rate": 9.981443394050525e-06,
2647
+ "loss": 0.0014,
2648
+ "num_input_tokens_seen": 312742656,
2649
+ "step": 1650
2650
+ },
2651
+ {
2652
+ "epoch": 6.3531669865642995,
2653
+ "grad_norm": 0.4749620258808136,
2654
+ "learning_rate": 9.847620200793343e-06,
2655
+ "loss": 0.0016,
2656
+ "num_input_tokens_seen": 313683840,
2657
+ "step": 1655
2658
+ },
2659
+ {
2660
+ "epoch": 6.372360844529751,
2661
+ "grad_norm": 0.27900460362434387,
2662
+ "learning_rate": 9.714479797699694e-06,
2663
+ "loss": 0.0015,
2664
+ "num_input_tokens_seen": 314630400,
2665
+ "step": 1660
2666
+ },
2667
+ {
2668
+ "epoch": 6.391554702495202,
2669
+ "grad_norm": 0.15642359852790833,
2670
+ "learning_rate": 9.582028184286423e-06,
2671
+ "loss": 0.0016,
2672
+ "num_input_tokens_seen": 315612544,
2673
+ "step": 1665
2674
+ },
2675
+ {
2676
+ "epoch": 6.410748560460653,
2677
+ "grad_norm": 0.2352278232574463,
2678
+ "learning_rate": 9.450271329032404e-06,
2679
+ "loss": 0.0016,
2680
+ "num_input_tokens_seen": 316564224,
2681
+ "step": 1670
2682
+ },
2683
+ {
2684
+ "epoch": 6.429942418426104,
2685
+ "grad_norm": 0.2565127909183502,
2686
+ "learning_rate": 9.3192151691096e-06,
2687
+ "loss": 0.0018,
2688
+ "num_input_tokens_seen": 317537024,
2689
+ "step": 1675
2690
+ },
2691
+ {
2692
+ "epoch": 6.449136276391554,
2693
+ "grad_norm": 0.23012320697307587,
2694
+ "learning_rate": 9.18886561011557e-06,
2695
+ "loss": 0.0016,
2696
+ "num_input_tokens_seen": 318482944,
2697
+ "step": 1680
2698
+ },
2699
+ {
2700
+ "epoch": 6.468330134357005,
2701
+ "grad_norm": 0.15872737765312195,
2702
+ "learning_rate": 9.059228525807296e-06,
2703
+ "loss": 0.0015,
2704
+ "num_input_tokens_seen": 319438848,
2705
+ "step": 1685
2706
+ },
2707
+ {
2708
+ "epoch": 6.487523992322457,
2709
+ "grad_norm": 0.1375139206647873,
2710
+ "learning_rate": 8.930309757836517e-06,
2711
+ "loss": 0.0016,
2712
+ "num_input_tokens_seen": 320388736,
2713
+ "step": 1690
2714
+ },
2715
+ {
2716
+ "epoch": 6.506717850287908,
2717
+ "grad_norm": 0.12399590760469437,
2718
+ "learning_rate": 8.802115115486535e-06,
2719
+ "loss": 0.0013,
2720
+ "num_input_tokens_seen": 321354880,
2721
+ "step": 1695
2722
+ },
2723
+ {
2724
+ "epoch": 6.525911708253359,
2725
+ "grad_norm": 0.23495157063007355,
2726
+ "learning_rate": 8.67465037541038e-06,
2727
+ "loss": 0.0016,
2728
+ "num_input_tokens_seen": 322301952,
2729
+ "step": 1700
2730
+ },
2731
+ {
2732
+ "epoch": 6.54510556621881,
2733
+ "grad_norm": 0.15574663877487183,
2734
+ "learning_rate": 8.54792128137053e-06,
2735
+ "loss": 0.0014,
2736
+ "num_input_tokens_seen": 323246208,
2737
+ "step": 1705
2738
+ },
2739
+ {
2740
+ "epoch": 6.564299424184261,
2741
+ "grad_norm": 0.1617659628391266,
2742
+ "learning_rate": 8.421933543980126e-06,
2743
+ "loss": 0.0015,
2744
+ "num_input_tokens_seen": 324191616,
2745
+ "step": 1710
2746
+ },
2747
+ {
2748
+ "epoch": 6.583493282149712,
2749
+ "grad_norm": 0.14752747118473053,
2750
+ "learning_rate": 8.29669284044557e-06,
2751
+ "loss": 0.0017,
2752
+ "num_input_tokens_seen": 325125888,
2753
+ "step": 1715
2754
+ },
2755
+ {
2756
+ "epoch": 6.6026871401151634,
2757
+ "grad_norm": 0.2986501455307007,
2758
+ "learning_rate": 8.172204814310742e-06,
2759
+ "loss": 0.0015,
2760
+ "num_input_tokens_seen": 326070784,
2761
+ "step": 1720
2762
+ },
2763
+ {
2764
+ "epoch": 6.621880998080615,
2765
+ "grad_norm": 0.23812003433704376,
2766
+ "learning_rate": 8.048475075202727e-06,
2767
+ "loss": 0.0017,
2768
+ "num_input_tokens_seen": 326995712,
2769
+ "step": 1725
2770
+ },
2771
+ {
2772
+ "epoch": 6.641074856046066,
2773
+ "grad_norm": 0.13329505920410156,
2774
+ "learning_rate": 7.92550919857896e-06,
2775
+ "loss": 0.0019,
2776
+ "num_input_tokens_seen": 327954816,
2777
+ "step": 1730
2778
+ },
2779
+ {
2780
+ "epoch": 6.660268714011516,
2781
+ "grad_norm": 0.10942558199167252,
2782
+ "learning_rate": 7.803312725476031e-06,
2783
+ "loss": 0.0016,
2784
+ "num_input_tokens_seen": 328909184,
2785
+ "step": 1735
2786
+ },
2787
+ {
2788
+ "epoch": 6.679462571976967,
2789
+ "grad_norm": 0.10470914840698242,
2790
+ "learning_rate": 7.681891162260015e-06,
2791
+ "loss": 0.0015,
2792
+ "num_input_tokens_seen": 329862144,
2793
+ "step": 1740
2794
+ },
2795
+ {
2796
+ "epoch": 6.698656429942418,
2797
+ "grad_norm": 0.2065214365720749,
2798
+ "learning_rate": 7.561249980378301e-06,
2799
+ "loss": 0.0018,
2800
+ "num_input_tokens_seen": 330812544,
2801
+ "step": 1745
2802
+ },
2803
+ {
2804
+ "epoch": 6.717850287907869,
2805
+ "grad_norm": 0.2046762853860855,
2806
+ "learning_rate": 7.441394616113062e-06,
2807
+ "loss": 0.0016,
2808
+ "num_input_tokens_seen": 331769216,
2809
+ "step": 1750
2810
+ },
2811
+ {
2812
+ "epoch": 6.737044145873321,
2813
+ "grad_norm": 0.17153848707675934,
2814
+ "learning_rate": 7.3223304703363135e-06,
2815
+ "loss": 0.0015,
2816
+ "num_input_tokens_seen": 332703744,
2817
+ "step": 1755
2818
+ },
2819
+ {
2820
+ "epoch": 6.756238003838772,
2821
+ "grad_norm": 0.11358804255723953,
2822
+ "learning_rate": 7.20406290826649e-06,
2823
+ "loss": 0.0015,
2824
+ "num_input_tokens_seen": 333635712,
2825
+ "step": 1760
2826
+ },
2827
+ {
2828
+ "epoch": 6.775431861804223,
2829
+ "grad_norm": 0.21972091495990753,
2830
+ "learning_rate": 7.086597259226707e-06,
2831
+ "loss": 0.0016,
2832
+ "num_input_tokens_seen": 334579968,
2833
+ "step": 1765
2834
+ },
2835
+ {
2836
+ "epoch": 6.794625719769674,
2837
+ "grad_norm": 0.16843383014202118,
2838
+ "learning_rate": 6.969938816404639e-06,
2839
+ "loss": 0.0017,
2840
+ "num_input_tokens_seen": 335506304,
2841
+ "step": 1770
2842
+ },
2843
+ {
2844
+ "epoch": 6.813819577735125,
2845
+ "grad_norm": 0.155124694108963,
2846
+ "learning_rate": 6.854092836613948e-06,
2847
+ "loss": 0.0019,
2848
+ "num_input_tokens_seen": 336457856,
2849
+ "step": 1775
2850
+ },
2851
+ {
2852
+ "epoch": 6.833013435700575,
2853
+ "grad_norm": 0.11480195820331573,
2854
+ "learning_rate": 6.739064540057424e-06,
2855
+ "loss": 0.0015,
2856
+ "num_input_tokens_seen": 337391616,
2857
+ "step": 1780
2858
+ },
2859
+ {
2860
+ "epoch": 6.8522072936660265,
2861
+ "grad_norm": 0.13995982706546783,
2862
+ "learning_rate": 6.624859110091791e-06,
2863
+ "loss": 0.0019,
2864
+ "num_input_tokens_seen": 338349568,
2865
+ "step": 1785
2866
+ },
2867
+ {
2868
+ "epoch": 6.871401151631478,
2869
+ "grad_norm": 0.2626807689666748,
2870
+ "learning_rate": 6.511481692994076e-06,
2871
+ "loss": 0.0017,
2872
+ "num_input_tokens_seen": 339293440,
2873
+ "step": 1790
2874
+ },
2875
+ {
2876
+ "epoch": 6.890595009596929,
2877
+ "grad_norm": 0.21995946764945984,
2878
+ "learning_rate": 6.3989373977297315e-06,
2879
+ "loss": 0.0012,
2880
+ "num_input_tokens_seen": 340271360,
2881
+ "step": 1795
2882
+ },
2883
+ {
2884
+ "epoch": 6.90978886756238,
2885
+ "grad_norm": 0.11761938780546188,
2886
+ "learning_rate": 6.28723129572247e-06,
2887
+ "loss": 0.0014,
2888
+ "num_input_tokens_seen": 341216384,
2889
+ "step": 1800
2890
+ },
2891
+ {
2892
+ "epoch": 6.928982725527831,
2893
+ "grad_norm": 0.14595161378383636,
2894
+ "learning_rate": 6.1763684206256525e-06,
2895
+ "loss": 0.0014,
2896
+ "num_input_tokens_seen": 342183808,
2897
+ "step": 1805
2898
+ },
2899
+ {
2900
+ "epoch": 6.948176583493282,
2901
+ "grad_norm": 0.1292518526315689,
2902
+ "learning_rate": 6.066353768095504e-06,
2903
+ "loss": 0.0014,
2904
+ "num_input_tokens_seen": 343123712,
2905
+ "step": 1810
2906
+ },
2907
+ {
2908
+ "epoch": 6.967370441458733,
2909
+ "grad_norm": 0.19953125715255737,
2910
+ "learning_rate": 5.957192295566022e-06,
2911
+ "loss": 0.0014,
2912
+ "num_input_tokens_seen": 344062976,
2913
+ "step": 1815
2914
+ },
2915
+ {
2916
+ "epoch": 6.9865642994241846,
2917
+ "grad_norm": 0.17268332839012146,
2918
+ "learning_rate": 5.848888922025553e-06,
2919
+ "loss": 0.0015,
2920
+ "num_input_tokens_seen": 345019904,
2921
+ "step": 1820
2922
+ },
2923
+ {
2924
+ "epoch": 7.005758157389636,
2925
+ "grad_norm": 0.0566362664103508,
2926
+ "learning_rate": 5.741448527795137e-06,
2927
+ "loss": 0.0011,
2928
+ "num_input_tokens_seen": 345976320,
2929
+ "step": 1825
2930
+ },
2931
+ {
2932
+ "epoch": 7.024952015355086,
2933
+ "grad_norm": 0.07774636894464493,
2934
+ "learning_rate": 5.634875954308638e-06,
2935
+ "loss": 0.0007,
2936
+ "num_input_tokens_seen": 346933120,
2937
+ "step": 1830
2938
+ },
2939
+ {
2940
+ "epoch": 7.044145873320537,
2941
+ "grad_norm": 0.09574442356824875,
2942
+ "learning_rate": 5.52917600389451e-06,
2943
+ "loss": 0.0008,
2944
+ "num_input_tokens_seen": 347883392,
2945
+ "step": 1835
2946
+ },
2947
+ {
2948
+ "epoch": 7.063339731285988,
2949
+ "grad_norm": 0.04345453530550003,
2950
+ "learning_rate": 5.424353439559446e-06,
2951
+ "loss": 0.0007,
2952
+ "num_input_tokens_seen": 348815616,
2953
+ "step": 1840
2954
+ },
2955
+ {
2956
+ "epoch": 7.082533589251439,
2957
+ "grad_norm": 0.08021701127290726,
2958
+ "learning_rate": 5.320412984773748e-06,
2959
+ "loss": 0.0007,
2960
+ "num_input_tokens_seen": 349761152,
2961
+ "step": 1845
2962
+ },
2963
+ {
2964
+ "epoch": 7.1017274472168905,
2965
+ "grad_norm": 0.04693225026130676,
2966
+ "learning_rate": 5.217359323258459e-06,
2967
+ "loss": 0.0007,
2968
+ "num_input_tokens_seen": 350714880,
2969
+ "step": 1850
2970
+ },
2971
+ {
2972
+ "epoch": 7.120921305182342,
2973
+ "grad_norm": 0.041134320199489594,
2974
+ "learning_rate": 5.115197098774302e-06,
2975
+ "loss": 0.0007,
2976
+ "num_input_tokens_seen": 351676544,
2977
+ "step": 1855
2978
+ },
2979
+ {
2980
+ "epoch": 7.140115163147793,
2981
+ "grad_norm": 0.06960324198007584,
2982
+ "learning_rate": 5.013930914912476e-06,
2983
+ "loss": 0.0007,
2984
+ "num_input_tokens_seen": 352604672,
2985
+ "step": 1860
2986
+ },
2987
+ {
2988
+ "epoch": 7.159309021113244,
2989
+ "grad_norm": 0.057376183569431305,
2990
+ "learning_rate": 4.913565334887135e-06,
2991
+ "loss": 0.0007,
2992
+ "num_input_tokens_seen": 353552640,
2993
+ "step": 1865
2994
+ },
2995
+ {
2996
+ "epoch": 7.178502879078695,
2997
+ "grad_norm": 0.04041101410984993,
2998
+ "learning_rate": 4.814104881329828e-06,
2999
+ "loss": 0.0006,
3000
+ "num_input_tokens_seen": 354511360,
3001
+ "step": 1870
3002
+ },
3003
+ {
3004
+ "epoch": 7.197696737044146,
3005
+ "grad_norm": 0.052469249814748764,
3006
+ "learning_rate": 4.715554036085673e-06,
3007
+ "loss": 0.0007,
3008
+ "num_input_tokens_seen": 355478144,
3009
+ "step": 1875
3010
+ },
3011
+ {
3012
+ "epoch": 7.2168905950095965,
3013
+ "grad_norm": 0.11581304669380188,
3014
+ "learning_rate": 4.617917240011394e-06,
3015
+ "loss": 0.0006,
3016
+ "num_input_tokens_seen": 356424448,
3017
+ "step": 1880
3018
+ },
3019
+ {
3020
+ "epoch": 7.236084452975048,
3021
+ "grad_norm": 0.04358832538127899,
3022
+ "learning_rate": 4.521198892775203e-06,
3023
+ "loss": 0.0006,
3024
+ "num_input_tokens_seen": 357374208,
3025
+ "step": 1885
3026
+ },
3027
+ {
3028
+ "epoch": 7.255278310940499,
3029
+ "grad_norm": 0.04860702529549599,
3030
+ "learning_rate": 4.425403352658591e-06,
3031
+ "loss": 0.0006,
3032
+ "num_input_tokens_seen": 358315392,
3033
+ "step": 1890
3034
+ },
3035
+ {
3036
+ "epoch": 7.27447216890595,
3037
+ "grad_norm": 0.05813557654619217,
3038
+ "learning_rate": 4.330534936359873e-06,
3039
+ "loss": 0.0007,
3040
+ "num_input_tokens_seen": 359280384,
3041
+ "step": 1895
3042
+ },
3043
+ {
3044
+ "epoch": 7.293666026871401,
3045
+ "grad_norm": 0.08444052934646606,
3046
+ "learning_rate": 4.236597918799709e-06,
3047
+ "loss": 0.0007,
3048
+ "num_input_tokens_seen": 360221440,
3049
+ "step": 1900
3050
+ },
3051
+ {
3052
+ "epoch": 7.312859884836852,
3053
+ "grad_norm": 0.09866166114807129,
3054
+ "learning_rate": 4.143596532928468e-06,
3055
+ "loss": 0.0006,
3056
+ "num_input_tokens_seen": 361175936,
3057
+ "step": 1905
3058
+ },
3059
+ {
3060
+ "epoch": 7.332053742802303,
3061
+ "grad_norm": 0.039118677377700806,
3062
+ "learning_rate": 4.051534969535472e-06,
3063
+ "loss": 0.0006,
3064
+ "num_input_tokens_seen": 362113280,
3065
+ "step": 1910
3066
+ },
3067
+ {
3068
+ "epoch": 7.3512476007677545,
3069
+ "grad_norm": 0.08323455601930618,
3070
+ "learning_rate": 3.960417377060152e-06,
3071
+ "loss": 0.0006,
3072
+ "num_input_tokens_seen": 363056512,
3073
+ "step": 1915
3074
+ },
3075
+ {
3076
+ "epoch": 7.370441458733206,
3077
+ "grad_norm": 0.03620649501681328,
3078
+ "learning_rate": 3.8702478614051355e-06,
3079
+ "loss": 0.0006,
3080
+ "num_input_tokens_seen": 364026752,
3081
+ "step": 1920
3082
+ },
3083
+ {
3084
+ "epoch": 7.389635316698657,
3085
+ "grad_norm": 0.06456654518842697,
3086
+ "learning_rate": 3.7810304857511914e-06,
3087
+ "loss": 0.0006,
3088
+ "num_input_tokens_seen": 364979456,
3089
+ "step": 1925
3090
+ },
3091
+ {
3092
+ "epoch": 7.408829174664108,
3093
+ "grad_norm": 0.057450417429208755,
3094
+ "learning_rate": 3.6927692703741634e-06,
3095
+ "loss": 0.0006,
3096
+ "num_input_tokens_seen": 365919488,
3097
+ "step": 1930
3098
+ },
3099
+ {
3100
+ "epoch": 7.428023032629558,
3101
+ "grad_norm": 0.10976872593164444,
3102
+ "learning_rate": 3.605468192463815e-06,
3103
+ "loss": 0.0006,
3104
+ "num_input_tokens_seen": 366871552,
3105
+ "step": 1935
3106
+ },
3107
+ {
3108
+ "epoch": 7.447216890595009,
3109
+ "grad_norm": 0.04592859372496605,
3110
+ "learning_rate": 3.5191311859445796e-06,
3111
+ "loss": 0.0006,
3112
+ "num_input_tokens_seen": 367824768,
3113
+ "step": 1940
3114
+ },
3115
+ {
3116
+ "epoch": 7.4664107485604605,
3117
+ "grad_norm": 0.08413061499595642,
3118
+ "learning_rate": 3.4337621412983274e-06,
3119
+ "loss": 0.0007,
3120
+ "num_input_tokens_seen": 368776704,
3121
+ "step": 1945
3122
+ },
3123
+ {
3124
+ "epoch": 7.485604606525912,
3125
+ "grad_norm": 0.06421375274658203,
3126
+ "learning_rate": 3.3493649053890326e-06,
3127
+ "loss": 0.0006,
3128
+ "num_input_tokens_seen": 369726848,
3129
+ "step": 1950
3130
+ },
3131
+ {
3132
+ "epoch": 7.504798464491363,
3133
+ "grad_norm": 0.04606764018535614,
3134
+ "learning_rate": 3.2659432812894296e-06,
3135
+ "loss": 0.0007,
3136
+ "num_input_tokens_seen": 370669184,
3137
+ "step": 1955
3138
+ },
3139
+ {
3140
+ "epoch": 7.523992322456814,
3141
+ "grad_norm": 0.14935247600078583,
3142
+ "learning_rate": 3.183501028109642e-06,
3143
+ "loss": 0.001,
3144
+ "num_input_tokens_seen": 371619072,
3145
+ "step": 1960
3146
+ },
3147
+ {
3148
+ "epoch": 7.543186180422265,
3149
+ "grad_norm": 0.03758896514773369,
3150
+ "learning_rate": 3.1020418608278035e-06,
3151
+ "loss": 0.0006,
3152
+ "num_input_tokens_seen": 372591104,
3153
+ "step": 1965
3154
+ },
3155
+ {
3156
+ "epoch": 7.562380038387716,
3157
+ "grad_norm": 0.08971104770898819,
3158
+ "learning_rate": 3.0215694501226384e-06,
3159
+ "loss": 0.0006,
3160
+ "num_input_tokens_seen": 373540352,
3161
+ "step": 1970
3162
+ },
3163
+ {
3164
+ "epoch": 7.581573896353167,
3165
+ "grad_norm": 0.049786727875471115,
3166
+ "learning_rate": 2.942087422208051e-06,
3167
+ "loss": 0.0006,
3168
+ "num_input_tokens_seen": 374494336,
3169
+ "step": 1975
3170
+ },
3171
+ {
3172
+ "epoch": 7.600767754318618,
3173
+ "grad_norm": 0.14331580698490143,
3174
+ "learning_rate": 2.8635993586697553e-06,
3175
+ "loss": 0.0007,
3176
+ "num_input_tokens_seen": 375443968,
3177
+ "step": 1980
3178
+ },
3179
+ {
3180
+ "epoch": 7.619961612284069,
3181
+ "grad_norm": 0.1280643343925476,
3182
+ "learning_rate": 2.7861087963038435e-06,
3183
+ "loss": 0.0007,
3184
+ "num_input_tokens_seen": 376398848,
3185
+ "step": 1985
3186
+ },
3187
+ {
3188
+ "epoch": 7.63915547024952,
3189
+ "grad_norm": 0.07448034733533859,
3190
+ "learning_rate": 2.70961922695743e-06,
3191
+ "loss": 0.0006,
3192
+ "num_input_tokens_seen": 377363072,
3193
+ "step": 1990
3194
+ },
3195
+ {
3196
+ "epoch": 7.658349328214971,
3197
+ "grad_norm": 0.047854091972112656,
3198
+ "learning_rate": 2.6341340973713187e-06,
3199
+ "loss": 0.0005,
3200
+ "num_input_tokens_seen": 378285440,
3201
+ "step": 1995
3202
+ },
3203
+ {
3204
+ "epoch": 7.677543186180422,
3205
+ "grad_norm": 0.05751164257526398,
3206
+ "learning_rate": 2.5596568090246548e-06,
3207
+ "loss": 0.0006,
3208
+ "num_input_tokens_seen": 379227776,
3209
+ "step": 2000
3210
+ },
3211
+ {
3212
+ "epoch": 7.696737044145873,
3213
+ "grad_norm": 0.09036415070295334,
3214
+ "learning_rate": 2.486190717981665e-06,
3215
+ "loss": 0.0008,
3216
+ "num_input_tokens_seen": 380168064,
3217
+ "step": 2005
3218
+ },
3219
+ {
3220
+ "epoch": 7.7159309021113245,
3221
+ "grad_norm": 0.0800870880484581,
3222
+ "learning_rate": 2.4137391347404476e-06,
3223
+ "loss": 0.0007,
3224
+ "num_input_tokens_seen": 381124736,
3225
+ "step": 2010
3226
+ },
3227
+ {
3228
+ "epoch": 7.735124760076776,
3229
+ "grad_norm": 0.055397044867277145,
3230
+ "learning_rate": 2.3423053240837515e-06,
3231
+ "loss": 0.0007,
3232
+ "num_input_tokens_seen": 382078592,
3233
+ "step": 2015
3234
+ },
3235
+ {
3236
+ "epoch": 7.754318618042227,
3237
+ "grad_norm": 0.06471221148967743,
3238
+ "learning_rate": 2.271892504931905e-06,
3239
+ "loss": 0.0006,
3240
+ "num_input_tokens_seen": 383012224,
3241
+ "step": 2020
3242
+ },
3243
+ {
3244
+ "epoch": 7.773512476007678,
3245
+ "grad_norm": 0.05552973225712776,
3246
+ "learning_rate": 2.2025038501977486e-06,
3247
+ "loss": 0.0007,
3248
+ "num_input_tokens_seen": 383955328,
3249
+ "step": 2025
3250
+ },
3251
+ {
3252
+ "epoch": 7.792706333973129,
3253
+ "grad_norm": 0.05302416905760765,
3254
+ "learning_rate": 2.1341424866436364e-06,
3255
+ "loss": 0.0006,
3256
+ "num_input_tokens_seen": 384918528,
3257
+ "step": 2030
3258
+ },
3259
+ {
3260
+ "epoch": 7.811900191938579,
3261
+ "grad_norm": 0.04085630923509598,
3262
+ "learning_rate": 2.0668114947405726e-06,
3263
+ "loss": 0.0006,
3264
+ "num_input_tokens_seen": 385865472,
3265
+ "step": 2035
3266
+ },
3267
+ {
3268
+ "epoch": 7.8310940499040305,
3269
+ "grad_norm": 0.11027319729328156,
3270
+ "learning_rate": 2.0005139085293945e-06,
3271
+ "loss": 0.0007,
3272
+ "num_input_tokens_seen": 386815488,
3273
+ "step": 2040
3274
+ },
3275
+ {
3276
+ "epoch": 7.850287907869482,
3277
+ "grad_norm": 0.10259139537811279,
3278
+ "learning_rate": 1.9352527154840345e-06,
3279
+ "loss": 0.0006,
3280
+ "num_input_tokens_seen": 387746176,
3281
+ "step": 2045
3282
+ },
3283
+ {
3284
+ "epoch": 7.869481765834933,
3285
+ "grad_norm": 0.08809423446655273,
3286
+ "learning_rate": 1.8710308563769124e-06,
3287
+ "loss": 0.0006,
3288
+ "num_input_tokens_seen": 388682752,
3289
+ "step": 2050
3290
+ },
3291
+ {
3292
+ "epoch": 7.888675623800384,
3293
+ "grad_norm": 0.04966364800930023,
3294
+ "learning_rate": 1.8078512251464286e-06,
3295
+ "loss": 0.0006,
3296
+ "num_input_tokens_seen": 389629056,
3297
+ "step": 2055
3298
+ },
3299
+ {
3300
+ "epoch": 7.907869481765835,
3301
+ "grad_norm": 0.17464539408683777,
3302
+ "learning_rate": 1.7457166687665449e-06,
3303
+ "loss": 0.0008,
3304
+ "num_input_tokens_seen": 390563584,
3305
+ "step": 2060
3306
+ },
3307
+ {
3308
+ "epoch": 7.927063339731286,
3309
+ "grad_norm": 0.04215671867132187,
3310
+ "learning_rate": 1.684629987118494e-06,
3311
+ "loss": 0.0006,
3312
+ "num_input_tokens_seen": 391511808,
3313
+ "step": 2065
3314
+ },
3315
+ {
3316
+ "epoch": 7.946257197696737,
3317
+ "grad_norm": 0.07717634737491608,
3318
+ "learning_rate": 1.624593932864632e-06,
3319
+ "loss": 0.0007,
3320
+ "num_input_tokens_seen": 392460032,
3321
+ "step": 2070
3322
+ },
3323
+ {
3324
+ "epoch": 7.9654510556621885,
3325
+ "grad_norm": 0.04429319128394127,
3326
+ "learning_rate": 1.5656112113243721e-06,
3327
+ "loss": 0.0006,
3328
+ "num_input_tokens_seen": 393403264,
3329
+ "step": 2075
3330
+ },
3331
+ {
3332
+ "epoch": 7.984644913627639,
3333
+ "grad_norm": 0.06878451257944107,
3334
+ "learning_rate": 1.5076844803522922e-06,
3335
+ "loss": 0.0006,
3336
+ "num_input_tokens_seen": 394339584,
3337
+ "step": 2080
3338
+ },
3339
+ {
3340
+ "epoch": 8.00383877159309,
3341
+ "grad_norm": 0.05545974150300026,
3342
+ "learning_rate": 1.4508163502183786e-06,
3343
+ "loss": 0.0006,
3344
+ "num_input_tokens_seen": 395291264,
3345
+ "step": 2085
3346
+ },
3347
+ {
3348
+ "epoch": 8.023032629558541,
3349
+ "grad_norm": 0.07342655211687088,
3350
+ "learning_rate": 1.3950093834903866e-06,
3351
+ "loss": 0.0005,
3352
+ "num_input_tokens_seen": 396246272,
3353
+ "step": 2090
3354
+ },
3355
+ {
3356
+ "epoch": 8.042226487523992,
3357
+ "grad_norm": 0.030074596405029297,
3358
+ "learning_rate": 1.340266094918366e-06,
3359
+ "loss": 0.0005,
3360
+ "num_input_tokens_seen": 397189376,
3361
+ "step": 2095
3362
+ },
3363
+ {
3364
+ "epoch": 8.061420345489443,
3365
+ "grad_norm": 0.0349307544529438,
3366
+ "learning_rate": 1.286588951321363e-06,
3367
+ "loss": 0.0004,
3368
+ "num_input_tokens_seen": 398128000,
3369
+ "step": 2100
3370
+ }
3371
+ ],
3372
+ "logging_steps": 5,
3373
+ "max_steps": 2340,
3374
+ "num_input_tokens_seen": 398128000,
3375
+ "num_train_epochs": 9,
3376
+ "save_steps": 10,
3377
+ "stateful_callbacks": {
3378
+ "TrainerControl": {
3379
+ "args": {
3380
+ "should_epoch_stop": false,
3381
+ "should_evaluate": false,
3382
+ "should_log": false,
3383
+ "should_save": true,
3384
+ "should_training_stop": false
3385
+ },
3386
+ "attributes": {}
3387
+ }
3388
+ },
3389
+ "total_flos": 3.201849892590846e+18,
3390
+ "train_batch_size": 16,
3391
+ "trial_name": null,
3392
+ "trial_params": null
3393
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eb343fa9de0ecf90f35477907f4d0961c543f0aadbab5b9de4e6d0d2c99308b
3
+ size 5560