hardlyworking committed on
Commit
4693d77
·
verified ·
1 Parent(s): 75a3529

Training in progress, step 520, checkpoint

Browse files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-520/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-520/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "eos_token_id": 151645,
8
+ "head_dim": 128,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 2560,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 9728,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 36,
15
+ "model_type": "qwen3",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 36,
18
+ "num_key_value_heads": 8,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.51.3",
26
+ "use_cache": false,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
checkpoint-520/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 2048,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-520/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-520/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12552c83360cbb40b7a3c566e92efe5c9f577c79aed85bbd30d36bdda89cda26
3
+ size 4967215360
checkpoint-520/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2424850b6570893df38c22b1d9af7dc852b1053913b4da209ba62a9803bd7cda
3
+ size 3077766632
checkpoint-520/model.safetensors.index.json ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 8044936192
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
151
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
152
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
155
+ "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
162
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
163
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
164
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
166
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
167
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
168
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
169
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
170
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
171
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
172
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
173
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
174
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
175
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
176
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
178
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
179
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
180
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
182
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
183
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
184
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
185
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
186
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
187
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
190
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
191
+ "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
192
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
193
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
194
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
197
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
202
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
203
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
204
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
205
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
207
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
208
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
210
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
211
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
212
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
213
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
214
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
216
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
217
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
218
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
219
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
222
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
223
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
272
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
285
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
288
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
291
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
292
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
293
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
294
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
295
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
307
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
338
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
342
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
348
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
349
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
350
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
351
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
353
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
354
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
355
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
356
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
357
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
358
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
360
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
361
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
362
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
364
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
365
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
366
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
367
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
368
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
373
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
376
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
379
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
386
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
398
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
400
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
403
+ "model.norm.weight": "model-00002-of-00002.safetensors"
404
+ }
405
+ }
checkpoint-520/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56effce03357eb73ff86ba34e53d20d383dae22898a61ff66a4e30743b2a99ef
3
+ size 8172200510
checkpoint-520/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b083d9dde1f071db665fabbf1dcb68e9c94f9a61a816d0a9e4515374d00841e8
3
+ size 14512
checkpoint-520/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a89185d5b4e3b18201a6667201b8f8cdd235301ef3dbaafd3091861a6445453d
3
+ size 14512
checkpoint-520/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc5a935e08aec8834f6b14e299594966dcc654a63491495449bef863ea47f388
3
+ size 1064
checkpoint-520/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-520/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
checkpoint-520/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131072,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
checkpoint-520/trainer_state.json ADDED
@@ -0,0 +1,3746 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 65,
7
+ "global_step": 520,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0019230769230769232,
14
+ "grad_norm": 3.5625,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.1154,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.0019230769230769232,
21
+ "eval_loss": 1.137184739112854,
22
+ "eval_runtime": 34.305,
23
+ "eval_samples_per_second": 68.299,
24
+ "eval_steps_per_second": 17.082,
25
+ "step": 1
26
+ },
27
+ {
28
+ "epoch": 0.0038461538461538464,
29
+ "grad_norm": 3.5,
30
+ "learning_rate": 1.9230769230769234e-07,
31
+ "loss": 1.1559,
32
+ "step": 2
33
+ },
34
+ {
35
+ "epoch": 0.0057692307692307696,
36
+ "grad_norm": 3.515625,
37
+ "learning_rate": 3.846153846153847e-07,
38
+ "loss": 1.1031,
39
+ "step": 3
40
+ },
41
+ {
42
+ "epoch": 0.007692307692307693,
43
+ "grad_norm": 3.6875,
44
+ "learning_rate": 5.76923076923077e-07,
45
+ "loss": 1.1168,
46
+ "step": 4
47
+ },
48
+ {
49
+ "epoch": 0.009615384615384616,
50
+ "grad_norm": 3.78125,
51
+ "learning_rate": 7.692307692307694e-07,
52
+ "loss": 1.0839,
53
+ "step": 5
54
+ },
55
+ {
56
+ "epoch": 0.011538461538461539,
57
+ "grad_norm": 3.546875,
58
+ "learning_rate": 9.615384615384617e-07,
59
+ "loss": 1.0938,
60
+ "step": 6
61
+ },
62
+ {
63
+ "epoch": 0.013461538461538462,
64
+ "grad_norm": 3.71875,
65
+ "learning_rate": 1.153846153846154e-06,
66
+ "loss": 1.1459,
67
+ "step": 7
68
+ },
69
+ {
70
+ "epoch": 0.015384615384615385,
71
+ "grad_norm": 3.671875,
72
+ "learning_rate": 1.3461538461538462e-06,
73
+ "loss": 1.0944,
74
+ "step": 8
75
+ },
76
+ {
77
+ "epoch": 0.01730769230769231,
78
+ "grad_norm": 3.65625,
79
+ "learning_rate": 1.5384615384615387e-06,
80
+ "loss": 1.1185,
81
+ "step": 9
82
+ },
83
+ {
84
+ "epoch": 0.019230769230769232,
85
+ "grad_norm": 3.734375,
86
+ "learning_rate": 1.7307692307692308e-06,
87
+ "loss": 1.0774,
88
+ "step": 10
89
+ },
90
+ {
91
+ "epoch": 0.021153846153846155,
92
+ "grad_norm": 3.46875,
93
+ "learning_rate": 1.9230769230769234e-06,
94
+ "loss": 1.1568,
95
+ "step": 11
96
+ },
97
+ {
98
+ "epoch": 0.023076923076923078,
99
+ "grad_norm": 3.515625,
100
+ "learning_rate": 2.1153846153846155e-06,
101
+ "loss": 1.086,
102
+ "step": 12
103
+ },
104
+ {
105
+ "epoch": 0.025,
106
+ "grad_norm": 3.3125,
107
+ "learning_rate": 2.307692307692308e-06,
108
+ "loss": 1.0978,
109
+ "step": 13
110
+ },
111
+ {
112
+ "epoch": 0.026923076923076925,
113
+ "grad_norm": 3.359375,
114
+ "learning_rate": 2.5e-06,
115
+ "loss": 1.109,
116
+ "step": 14
117
+ },
118
+ {
119
+ "epoch": 0.028846153846153848,
120
+ "grad_norm": 2.953125,
121
+ "learning_rate": 2.6923076923076923e-06,
122
+ "loss": 1.0631,
123
+ "step": 15
124
+ },
125
+ {
126
+ "epoch": 0.03076923076923077,
127
+ "grad_norm": 2.890625,
128
+ "learning_rate": 2.8846153846153845e-06,
129
+ "loss": 1.1241,
130
+ "step": 16
131
+ },
132
+ {
133
+ "epoch": 0.032692307692307694,
134
+ "grad_norm": 2.6875,
135
+ "learning_rate": 3.0769230769230774e-06,
136
+ "loss": 1.0644,
137
+ "step": 17
138
+ },
139
+ {
140
+ "epoch": 0.03461538461538462,
141
+ "grad_norm": 2.640625,
142
+ "learning_rate": 3.2692307692307696e-06,
143
+ "loss": 1.0823,
144
+ "step": 18
145
+ },
146
+ {
147
+ "epoch": 0.03653846153846154,
148
+ "grad_norm": 2.703125,
149
+ "learning_rate": 3.4615384615384617e-06,
150
+ "loss": 1.0494,
151
+ "step": 19
152
+ },
153
+ {
154
+ "epoch": 0.038461538461538464,
155
+ "grad_norm": 2.375,
156
+ "learning_rate": 3.653846153846154e-06,
157
+ "loss": 1.0778,
158
+ "step": 20
159
+ },
160
+ {
161
+ "epoch": 0.04038461538461539,
162
+ "grad_norm": 2.359375,
163
+ "learning_rate": 3.846153846153847e-06,
164
+ "loss": 0.9911,
165
+ "step": 21
166
+ },
167
+ {
168
+ "epoch": 0.04230769230769231,
169
+ "grad_norm": 1.9296875,
170
+ "learning_rate": 4.0384615384615385e-06,
171
+ "loss": 1.1214,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.04423076923076923,
176
+ "grad_norm": 1.859375,
177
+ "learning_rate": 4.230769230769231e-06,
178
+ "loss": 1.0294,
179
+ "step": 23
180
+ },
181
+ {
182
+ "epoch": 0.046153846153846156,
183
+ "grad_norm": 1.6796875,
184
+ "learning_rate": 4.423076923076924e-06,
185
+ "loss": 1.0473,
186
+ "step": 24
187
+ },
188
+ {
189
+ "epoch": 0.04807692307692308,
190
+ "grad_norm": 1.4765625,
191
+ "learning_rate": 4.615384615384616e-06,
192
+ "loss": 1.0562,
193
+ "step": 25
194
+ },
195
+ {
196
+ "epoch": 0.05,
197
+ "grad_norm": 1.4140625,
198
+ "learning_rate": 4.807692307692308e-06,
199
+ "loss": 1.0273,
200
+ "step": 26
201
+ },
202
+ {
203
+ "epoch": 0.051923076923076926,
204
+ "grad_norm": 1.2890625,
205
+ "learning_rate": 5e-06,
206
+ "loss": 1.0713,
207
+ "step": 27
208
+ },
209
+ {
210
+ "epoch": 0.05384615384615385,
211
+ "grad_norm": 1.2578125,
212
+ "learning_rate": 5.192307692307693e-06,
213
+ "loss": 1.0218,
214
+ "step": 28
215
+ },
216
+ {
217
+ "epoch": 0.05576923076923077,
218
+ "grad_norm": 1.2265625,
219
+ "learning_rate": 5.384615384615385e-06,
220
+ "loss": 1.0322,
221
+ "step": 29
222
+ },
223
+ {
224
+ "epoch": 0.057692307692307696,
225
+ "grad_norm": 1.125,
226
+ "learning_rate": 5.576923076923077e-06,
227
+ "loss": 0.9993,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.05961538461538462,
232
+ "grad_norm": 1.1796875,
233
+ "learning_rate": 5.769230769230769e-06,
234
+ "loss": 0.9909,
235
+ "step": 31
236
+ },
237
+ {
238
+ "epoch": 0.06153846153846154,
239
+ "grad_norm": 1.078125,
240
+ "learning_rate": 5.961538461538462e-06,
241
+ "loss": 1.0497,
242
+ "step": 32
243
+ },
244
+ {
245
+ "epoch": 0.06346153846153846,
246
+ "grad_norm": 1.078125,
247
+ "learning_rate": 6.153846153846155e-06,
248
+ "loss": 1.0699,
249
+ "step": 33
250
+ },
251
+ {
252
+ "epoch": 0.06538461538461539,
253
+ "grad_norm": 1.1171875,
254
+ "learning_rate": 6.3461538461538466e-06,
255
+ "loss": 1.0065,
256
+ "step": 34
257
+ },
258
+ {
259
+ "epoch": 0.0673076923076923,
260
+ "grad_norm": 1.03125,
261
+ "learning_rate": 6.538461538461539e-06,
262
+ "loss": 0.9839,
263
+ "step": 35
264
+ },
265
+ {
266
+ "epoch": 0.06923076923076923,
267
+ "grad_norm": 1.421875,
268
+ "learning_rate": 6.730769230769232e-06,
269
+ "loss": 1.0069,
270
+ "step": 36
271
+ },
272
+ {
273
+ "epoch": 0.07115384615384615,
274
+ "grad_norm": 1.0546875,
275
+ "learning_rate": 6.923076923076923e-06,
276
+ "loss": 1.0441,
277
+ "step": 37
278
+ },
279
+ {
280
+ "epoch": 0.07307692307692308,
281
+ "grad_norm": 1.015625,
282
+ "learning_rate": 7.115384615384616e-06,
283
+ "loss": 0.9343,
284
+ "step": 38
285
+ },
286
+ {
287
+ "epoch": 0.075,
288
+ "grad_norm": 1.0234375,
289
+ "learning_rate": 7.307692307692308e-06,
290
+ "loss": 0.9853,
291
+ "step": 39
292
+ },
293
+ {
294
+ "epoch": 0.07692307692307693,
295
+ "grad_norm": 0.98046875,
296
+ "learning_rate": 7.500000000000001e-06,
297
+ "loss": 0.9502,
298
+ "step": 40
299
+ },
300
+ {
301
+ "epoch": 0.07884615384615384,
302
+ "grad_norm": 0.984375,
303
+ "learning_rate": 7.692307692307694e-06,
304
+ "loss": 0.987,
305
+ "step": 41
306
+ },
307
+ {
308
+ "epoch": 0.08076923076923077,
309
+ "grad_norm": 1.0234375,
310
+ "learning_rate": 7.884615384615384e-06,
311
+ "loss": 1.009,
312
+ "step": 42
313
+ },
314
+ {
315
+ "epoch": 0.08269230769230769,
316
+ "grad_norm": 0.9765625,
317
+ "learning_rate": 8.076923076923077e-06,
318
+ "loss": 1.0174,
319
+ "step": 43
320
+ },
321
+ {
322
+ "epoch": 0.08461538461538462,
323
+ "grad_norm": 0.99609375,
324
+ "learning_rate": 8.26923076923077e-06,
325
+ "loss": 1.0515,
326
+ "step": 44
327
+ },
328
+ {
329
+ "epoch": 0.08653846153846154,
330
+ "grad_norm": 1.0234375,
331
+ "learning_rate": 8.461538461538462e-06,
332
+ "loss": 1.0196,
333
+ "step": 45
334
+ },
335
+ {
336
+ "epoch": 0.08846153846153847,
337
+ "grad_norm": 0.94921875,
338
+ "learning_rate": 8.653846153846155e-06,
339
+ "loss": 0.9957,
340
+ "step": 46
341
+ },
342
+ {
343
+ "epoch": 0.09038461538461538,
344
+ "grad_norm": 1.0078125,
345
+ "learning_rate": 8.846153846153847e-06,
346
+ "loss": 0.9422,
347
+ "step": 47
348
+ },
349
+ {
350
+ "epoch": 0.09230769230769231,
351
+ "grad_norm": 0.97265625,
352
+ "learning_rate": 9.03846153846154e-06,
353
+ "loss": 0.9527,
354
+ "step": 48
355
+ },
356
+ {
357
+ "epoch": 0.09423076923076923,
358
+ "grad_norm": 1.015625,
359
+ "learning_rate": 9.230769230769232e-06,
360
+ "loss": 0.9944,
361
+ "step": 49
362
+ },
363
+ {
364
+ "epoch": 0.09615384615384616,
365
+ "grad_norm": 1.0078125,
366
+ "learning_rate": 9.423076923076923e-06,
367
+ "loss": 0.9794,
368
+ "step": 50
369
+ },
370
+ {
371
+ "epoch": 0.09807692307692308,
372
+ "grad_norm": 1.03125,
373
+ "learning_rate": 9.615384615384616e-06,
374
+ "loss": 1.0226,
375
+ "step": 51
376
+ },
377
+ {
378
+ "epoch": 0.1,
379
+ "grad_norm": 1.0,
380
+ "learning_rate": 9.807692307692308e-06,
381
+ "loss": 1.0325,
382
+ "step": 52
383
+ },
384
+ {
385
+ "epoch": 0.10192307692307692,
386
+ "grad_norm": 1.0234375,
387
+ "learning_rate": 1e-05,
388
+ "loss": 0.9893,
389
+ "step": 53
390
+ },
391
+ {
392
+ "epoch": 0.10384615384615385,
393
+ "grad_norm": 0.98046875,
394
+ "learning_rate": 9.999974723001716e-06,
395
+ "loss": 1.0543,
396
+ "step": 54
397
+ },
398
+ {
399
+ "epoch": 0.10576923076923077,
400
+ "grad_norm": 1.03125,
401
+ "learning_rate": 9.999898892262433e-06,
402
+ "loss": 1.0227,
403
+ "step": 55
404
+ },
405
+ {
406
+ "epoch": 0.1076923076923077,
407
+ "grad_norm": 1.046875,
408
+ "learning_rate": 9.999772508548863e-06,
409
+ "loss": 1.067,
410
+ "step": 56
411
+ },
412
+ {
413
+ "epoch": 0.10961538461538461,
414
+ "grad_norm": 1.015625,
415
+ "learning_rate": 9.999595573138845e-06,
416
+ "loss": 0.9794,
417
+ "step": 57
418
+ },
419
+ {
420
+ "epoch": 0.11153846153846154,
421
+ "grad_norm": 1.0625,
422
+ "learning_rate": 9.999368087821337e-06,
423
+ "loss": 1.0166,
424
+ "step": 58
425
+ },
426
+ {
427
+ "epoch": 0.11346153846153846,
428
+ "grad_norm": 1.0546875,
429
+ "learning_rate": 9.999090054896397e-06,
430
+ "loss": 1.0092,
431
+ "step": 59
432
+ },
433
+ {
434
+ "epoch": 0.11538461538461539,
435
+ "grad_norm": 1.015625,
436
+ "learning_rate": 9.99876147717516e-06,
437
+ "loss": 0.9518,
438
+ "step": 60
439
+ },
440
+ {
441
+ "epoch": 0.11730769230769231,
442
+ "grad_norm": 1.0390625,
443
+ "learning_rate": 9.99838235797981e-06,
444
+ "loss": 0.9558,
445
+ "step": 61
446
+ },
447
+ {
448
+ "epoch": 0.11923076923076924,
449
+ "grad_norm": 1.078125,
450
+ "learning_rate": 9.997952701143547e-06,
451
+ "loss": 1.0134,
452
+ "step": 62
453
+ },
454
+ {
455
+ "epoch": 0.12115384615384615,
456
+ "grad_norm": 1.03125,
457
+ "learning_rate": 9.997472511010543e-06,
458
+ "loss": 0.8941,
459
+ "step": 63
460
+ },
461
+ {
462
+ "epoch": 0.12307692307692308,
463
+ "grad_norm": 1.046875,
464
+ "learning_rate": 9.996941792435903e-06,
465
+ "loss": 0.9207,
466
+ "step": 64
467
+ },
468
+ {
469
+ "epoch": 0.125,
470
+ "grad_norm": 1.0625,
471
+ "learning_rate": 9.996360550785619e-06,
472
+ "loss": 0.9351,
473
+ "step": 65
474
+ },
475
+ {
476
+ "epoch": 0.125,
477
+ "eval_loss": 1.0073611736297607,
478
+ "eval_runtime": 34.5277,
479
+ "eval_samples_per_second": 67.859,
480
+ "eval_steps_per_second": 16.972,
481
+ "step": 65
482
+ },
483
+ {
484
+ "epoch": 0.12692307692307692,
485
+ "grad_norm": 1.046875,
486
+ "learning_rate": 9.995728791936505e-06,
487
+ "loss": 1.0024,
488
+ "step": 66
489
+ },
490
+ {
491
+ "epoch": 0.12884615384615383,
492
+ "grad_norm": 1.109375,
493
+ "learning_rate": 9.995046522276152e-06,
494
+ "loss": 1.0015,
495
+ "step": 67
496
+ },
497
+ {
498
+ "epoch": 0.13076923076923078,
499
+ "grad_norm": 1.3359375,
500
+ "learning_rate": 9.994313748702848e-06,
501
+ "loss": 0.9595,
502
+ "step": 68
503
+ },
504
+ {
505
+ "epoch": 0.1326923076923077,
506
+ "grad_norm": 1.0625,
507
+ "learning_rate": 9.993530478625524e-06,
508
+ "loss": 0.9331,
509
+ "step": 69
510
+ },
511
+ {
512
+ "epoch": 0.1346153846153846,
513
+ "grad_norm": 1.0390625,
514
+ "learning_rate": 9.992696719963662e-06,
515
+ "loss": 0.9174,
516
+ "step": 70
517
+ },
518
+ {
519
+ "epoch": 0.13653846153846153,
520
+ "grad_norm": 1.0546875,
521
+ "learning_rate": 9.99181248114723e-06,
522
+ "loss": 1.009,
523
+ "step": 71
524
+ },
525
+ {
526
+ "epoch": 0.13846153846153847,
527
+ "grad_norm": 1.0390625,
528
+ "learning_rate": 9.990877771116588e-06,
529
+ "loss": 0.9661,
530
+ "step": 72
531
+ },
532
+ {
533
+ "epoch": 0.14038461538461539,
534
+ "grad_norm": 1.078125,
535
+ "learning_rate": 9.989892599322404e-06,
536
+ "loss": 0.9398,
537
+ "step": 73
538
+ },
539
+ {
540
+ "epoch": 0.1423076923076923,
541
+ "grad_norm": 1.078125,
542
+ "learning_rate": 9.988856975725551e-06,
543
+ "loss": 0.9973,
544
+ "step": 74
545
+ },
546
+ {
547
+ "epoch": 0.14423076923076922,
548
+ "grad_norm": 1.046875,
549
+ "learning_rate": 9.987770910797014e-06,
550
+ "loss": 0.8935,
551
+ "step": 75
552
+ },
553
+ {
554
+ "epoch": 0.14615384615384616,
555
+ "grad_norm": 1.09375,
556
+ "learning_rate": 9.986634415517774e-06,
557
+ "loss": 0.958,
558
+ "step": 76
559
+ },
560
+ {
561
+ "epoch": 0.14807692307692308,
562
+ "grad_norm": 1.0390625,
563
+ "learning_rate": 9.985447501378706e-06,
564
+ "loss": 0.9349,
565
+ "step": 77
566
+ },
567
+ {
568
+ "epoch": 0.15,
569
+ "grad_norm": 1.078125,
570
+ "learning_rate": 9.984210180380464e-06,
571
+ "loss": 0.9474,
572
+ "step": 78
573
+ },
574
+ {
575
+ "epoch": 0.1519230769230769,
576
+ "grad_norm": 1.0234375,
577
+ "learning_rate": 9.98292246503335e-06,
578
+ "loss": 0.9704,
579
+ "step": 79
580
+ },
581
+ {
582
+ "epoch": 0.15384615384615385,
583
+ "grad_norm": 1.0625,
584
+ "learning_rate": 9.981584368357198e-06,
585
+ "loss": 0.9745,
586
+ "step": 80
587
+ },
588
+ {
589
+ "epoch": 0.15576923076923077,
590
+ "grad_norm": 1.0859375,
591
+ "learning_rate": 9.980195903881231e-06,
592
+ "loss": 0.9527,
593
+ "step": 81
594
+ },
595
+ {
596
+ "epoch": 0.1576923076923077,
597
+ "grad_norm": 1.0859375,
598
+ "learning_rate": 9.978757085643937e-06,
599
+ "loss": 0.9732,
600
+ "step": 82
601
+ },
602
+ {
603
+ "epoch": 0.1596153846153846,
604
+ "grad_norm": 1.09375,
605
+ "learning_rate": 9.97726792819292e-06,
606
+ "loss": 0.9304,
607
+ "step": 83
608
+ },
609
+ {
610
+ "epoch": 0.16153846153846155,
611
+ "grad_norm": 1.03125,
612
+ "learning_rate": 9.975728446584748e-06,
613
+ "loss": 0.9999,
614
+ "step": 84
615
+ },
616
+ {
617
+ "epoch": 0.16346153846153846,
618
+ "grad_norm": 1.0625,
619
+ "learning_rate": 9.974138656384815e-06,
620
+ "loss": 0.9477,
621
+ "step": 85
622
+ },
623
+ {
624
+ "epoch": 0.16538461538461538,
625
+ "grad_norm": 1.0234375,
626
+ "learning_rate": 9.97249857366717e-06,
627
+ "loss": 0.9661,
628
+ "step": 86
629
+ },
630
+ {
631
+ "epoch": 0.1673076923076923,
632
+ "grad_norm": 1.078125,
633
+ "learning_rate": 9.970808215014357e-06,
634
+ "loss": 0.9763,
635
+ "step": 87
636
+ },
637
+ {
638
+ "epoch": 0.16923076923076924,
639
+ "grad_norm": 1.0703125,
640
+ "learning_rate": 9.969067597517255e-06,
641
+ "loss": 0.9292,
642
+ "step": 88
643
+ },
644
+ {
645
+ "epoch": 0.17115384615384616,
646
+ "grad_norm": 1.0625,
647
+ "learning_rate": 9.967276738774897e-06,
648
+ "loss": 0.9083,
649
+ "step": 89
650
+ },
651
+ {
652
+ "epoch": 0.17307692307692307,
653
+ "grad_norm": 1.1328125,
654
+ "learning_rate": 9.9654356568943e-06,
655
+ "loss": 0.9764,
656
+ "step": 90
657
+ },
658
+ {
659
+ "epoch": 0.175,
660
+ "grad_norm": 1.109375,
661
+ "learning_rate": 9.96354437049027e-06,
662
+ "loss": 0.9924,
663
+ "step": 91
664
+ },
665
+ {
666
+ "epoch": 0.17692307692307693,
667
+ "grad_norm": 1.09375,
668
+ "learning_rate": 9.961602898685225e-06,
669
+ "loss": 0.9551,
670
+ "step": 92
671
+ },
672
+ {
673
+ "epoch": 0.17884615384615385,
674
+ "grad_norm": 1.109375,
675
+ "learning_rate": 9.959611261108999e-06,
676
+ "loss": 1.0074,
677
+ "step": 93
678
+ },
679
+ {
680
+ "epoch": 0.18076923076923077,
681
+ "grad_norm": 1.09375,
682
+ "learning_rate": 9.957569477898636e-06,
683
+ "loss": 1.0348,
684
+ "step": 94
685
+ },
686
+ {
687
+ "epoch": 0.18269230769230768,
688
+ "grad_norm": 1.0859375,
689
+ "learning_rate": 9.955477569698197e-06,
690
+ "loss": 0.9745,
691
+ "step": 95
692
+ },
693
+ {
694
+ "epoch": 0.18461538461538463,
695
+ "grad_norm": 1.0625,
696
+ "learning_rate": 9.95333555765855e-06,
697
+ "loss": 0.9278,
698
+ "step": 96
699
+ },
700
+ {
701
+ "epoch": 0.18653846153846154,
702
+ "grad_norm": 1.0859375,
703
+ "learning_rate": 9.951143463437145e-06,
704
+ "loss": 0.9497,
705
+ "step": 97
706
+ },
707
+ {
708
+ "epoch": 0.18846153846153846,
709
+ "grad_norm": 1.109375,
710
+ "learning_rate": 9.948901309197807e-06,
711
+ "loss": 1.0283,
712
+ "step": 98
713
+ },
714
+ {
715
+ "epoch": 0.19038461538461537,
716
+ "grad_norm": 1.078125,
717
+ "learning_rate": 9.946609117610508e-06,
718
+ "loss": 0.9384,
719
+ "step": 99
720
+ },
721
+ {
722
+ "epoch": 0.19230769230769232,
723
+ "grad_norm": 1.0546875,
724
+ "learning_rate": 9.94426691185114e-06,
725
+ "loss": 1.0316,
726
+ "step": 100
727
+ },
728
+ {
729
+ "epoch": 0.19423076923076923,
730
+ "grad_norm": 1.1328125,
731
+ "learning_rate": 9.94187471560127e-06,
732
+ "loss": 0.9401,
733
+ "step": 101
734
+ },
735
+ {
736
+ "epoch": 0.19615384615384615,
737
+ "grad_norm": 1.0703125,
738
+ "learning_rate": 9.939432553047919e-06,
739
+ "loss": 0.9112,
740
+ "step": 102
741
+ },
742
+ {
743
+ "epoch": 0.19807692307692307,
744
+ "grad_norm": 1.0859375,
745
+ "learning_rate": 9.936940448883299e-06,
746
+ "loss": 0.9085,
747
+ "step": 103
748
+ },
749
+ {
750
+ "epoch": 0.2,
751
+ "grad_norm": 1.09375,
752
+ "learning_rate": 9.934398428304577e-06,
753
+ "loss": 0.9583,
754
+ "step": 104
755
+ },
756
+ {
757
+ "epoch": 0.20192307692307693,
758
+ "grad_norm": 1.1171875,
759
+ "learning_rate": 9.931806517013612e-06,
760
+ "loss": 0.9923,
761
+ "step": 105
762
+ },
763
+ {
764
+ "epoch": 0.20384615384615384,
765
+ "grad_norm": 1.09375,
766
+ "learning_rate": 9.929164741216702e-06,
767
+ "loss": 0.9281,
768
+ "step": 106
769
+ },
770
+ {
771
+ "epoch": 0.20576923076923076,
772
+ "grad_norm": 2.03125,
773
+ "learning_rate": 9.926473127624306e-06,
774
+ "loss": 0.9767,
775
+ "step": 107
776
+ },
777
+ {
778
+ "epoch": 0.2076923076923077,
779
+ "grad_norm": 1.0625,
780
+ "learning_rate": 9.923731703450794e-06,
781
+ "loss": 0.9255,
782
+ "step": 108
783
+ },
784
+ {
785
+ "epoch": 0.20961538461538462,
786
+ "grad_norm": 1.109375,
787
+ "learning_rate": 9.920940496414153e-06,
788
+ "loss": 0.9394,
789
+ "step": 109
790
+ },
791
+ {
792
+ "epoch": 0.21153846153846154,
793
+ "grad_norm": 1.0703125,
794
+ "learning_rate": 9.91809953473572e-06,
795
+ "loss": 0.8881,
796
+ "step": 110
797
+ },
798
+ {
799
+ "epoch": 0.21346153846153845,
800
+ "grad_norm": 1.0703125,
801
+ "learning_rate": 9.915208847139883e-06,
802
+ "loss": 0.9327,
803
+ "step": 111
804
+ },
805
+ {
806
+ "epoch": 0.2153846153846154,
807
+ "grad_norm": 1.09375,
808
+ "learning_rate": 9.912268462853811e-06,
809
+ "loss": 0.9637,
810
+ "step": 112
811
+ },
812
+ {
813
+ "epoch": 0.2173076923076923,
814
+ "grad_norm": 1.09375,
815
+ "learning_rate": 9.909278411607134e-06,
816
+ "loss": 0.904,
817
+ "step": 113
818
+ },
819
+ {
820
+ "epoch": 0.21923076923076923,
821
+ "grad_norm": 1.0859375,
822
+ "learning_rate": 9.906238723631662e-06,
823
+ "loss": 1.0109,
824
+ "step": 114
825
+ },
826
+ {
827
+ "epoch": 0.22115384615384615,
828
+ "grad_norm": 1.109375,
829
+ "learning_rate": 9.903149429661072e-06,
830
+ "loss": 0.9612,
831
+ "step": 115
832
+ },
833
+ {
834
+ "epoch": 0.2230769230769231,
835
+ "grad_norm": 1.140625,
836
+ "learning_rate": 9.90001056093059e-06,
837
+ "loss": 0.966,
838
+ "step": 116
839
+ },
840
+ {
841
+ "epoch": 0.225,
842
+ "grad_norm": 1.0859375,
843
+ "learning_rate": 9.896822149176695e-06,
844
+ "loss": 0.9076,
845
+ "step": 117
846
+ },
847
+ {
848
+ "epoch": 0.22692307692307692,
849
+ "grad_norm": 1.1328125,
850
+ "learning_rate": 9.893584226636773e-06,
851
+ "loss": 0.9435,
852
+ "step": 118
853
+ },
854
+ {
855
+ "epoch": 0.22884615384615384,
856
+ "grad_norm": 1.109375,
857
+ "learning_rate": 9.89029682604881e-06,
858
+ "loss": 0.9738,
859
+ "step": 119
860
+ },
861
+ {
862
+ "epoch": 0.23076923076923078,
863
+ "grad_norm": 1.09375,
864
+ "learning_rate": 9.886959980651056e-06,
865
+ "loss": 1.0009,
866
+ "step": 120
867
+ },
868
+ {
869
+ "epoch": 0.2326923076923077,
870
+ "grad_norm": 1.046875,
871
+ "learning_rate": 9.883573724181683e-06,
872
+ "loss": 0.9864,
873
+ "step": 121
874
+ },
875
+ {
876
+ "epoch": 0.23461538461538461,
877
+ "grad_norm": 1.125,
878
+ "learning_rate": 9.880138090878452e-06,
879
+ "loss": 0.9537,
880
+ "step": 122
881
+ },
882
+ {
883
+ "epoch": 0.23653846153846153,
884
+ "grad_norm": 1.09375,
885
+ "learning_rate": 9.87665311547836e-06,
886
+ "loss": 1.0204,
887
+ "step": 123
888
+ },
889
+ {
890
+ "epoch": 0.23846153846153847,
891
+ "grad_norm": 1.1328125,
892
+ "learning_rate": 9.873118833217294e-06,
893
+ "loss": 0.9623,
894
+ "step": 124
895
+ },
896
+ {
897
+ "epoch": 0.2403846153846154,
898
+ "grad_norm": 1.078125,
899
+ "learning_rate": 9.869535279829674e-06,
900
+ "loss": 0.9458,
901
+ "step": 125
902
+ },
903
+ {
904
+ "epoch": 0.2423076923076923,
905
+ "grad_norm": 1.0703125,
906
+ "learning_rate": 9.86590249154809e-06,
907
+ "loss": 0.9223,
908
+ "step": 126
909
+ },
910
+ {
911
+ "epoch": 0.24423076923076922,
912
+ "grad_norm": 1.078125,
913
+ "learning_rate": 9.862220505102933e-06,
914
+ "loss": 0.9847,
915
+ "step": 127
916
+ },
917
+ {
918
+ "epoch": 0.24615384615384617,
919
+ "grad_norm": 1.1015625,
920
+ "learning_rate": 9.858489357722028e-06,
921
+ "loss": 0.9633,
922
+ "step": 128
923
+ },
924
+ {
925
+ "epoch": 0.24807692307692308,
926
+ "grad_norm": 1.1328125,
927
+ "learning_rate": 9.854709087130261e-06,
928
+ "loss": 0.896,
929
+ "step": 129
930
+ },
931
+ {
932
+ "epoch": 0.25,
933
+ "grad_norm": 1.078125,
934
+ "learning_rate": 9.850879731549188e-06,
935
+ "loss": 0.8884,
936
+ "step": 130
937
+ },
938
+ {
939
+ "epoch": 0.25,
940
+ "eval_loss": 0.9758404493331909,
941
+ "eval_runtime": 34.4304,
942
+ "eval_samples_per_second": 68.05,
943
+ "eval_steps_per_second": 17.02,
944
+ "step": 130
945
+ },
946
+ {
947
+ "epoch": 0.2519230769230769,
948
+ "grad_norm": 1.078125,
949
+ "learning_rate": 9.847001329696653e-06,
950
+ "loss": 0.9047,
951
+ "step": 131
952
+ },
953
+ {
954
+ "epoch": 0.25384615384615383,
955
+ "grad_norm": 1.1015625,
956
+ "learning_rate": 9.843073920786402e-06,
957
+ "loss": 0.979,
958
+ "step": 132
959
+ },
960
+ {
961
+ "epoch": 0.25576923076923075,
962
+ "grad_norm": 1.09375,
963
+ "learning_rate": 9.839097544527674e-06,
964
+ "loss": 0.9224,
965
+ "step": 133
966
+ },
967
+ {
968
+ "epoch": 0.25769230769230766,
969
+ "grad_norm": 1.1328125,
970
+ "learning_rate": 9.835072241124815e-06,
971
+ "loss": 0.8739,
972
+ "step": 134
973
+ },
974
+ {
975
+ "epoch": 0.25961538461538464,
976
+ "grad_norm": 1.125,
977
+ "learning_rate": 9.830998051276858e-06,
978
+ "loss": 0.9326,
979
+ "step": 135
980
+ },
981
+ {
982
+ "epoch": 0.26153846153846155,
983
+ "grad_norm": 1.0859375,
984
+ "learning_rate": 9.82687501617712e-06,
985
+ "loss": 0.9303,
986
+ "step": 136
987
+ },
988
+ {
989
+ "epoch": 0.26346153846153847,
990
+ "grad_norm": 1.1171875,
991
+ "learning_rate": 9.822703177512783e-06,
992
+ "loss": 1.0002,
993
+ "step": 137
994
+ },
995
+ {
996
+ "epoch": 0.2653846153846154,
997
+ "grad_norm": 1.125,
998
+ "learning_rate": 9.818482577464466e-06,
999
+ "loss": 0.9562,
1000
+ "step": 138
1001
+ },
1002
+ {
1003
+ "epoch": 0.2673076923076923,
1004
+ "grad_norm": 1.140625,
1005
+ "learning_rate": 9.814213258705813e-06,
1006
+ "loss": 0.9212,
1007
+ "step": 139
1008
+ },
1009
+ {
1010
+ "epoch": 0.2692307692307692,
1011
+ "grad_norm": 1.125,
1012
+ "learning_rate": 9.809895264403046e-06,
1013
+ "loss": 0.9679,
1014
+ "step": 140
1015
+ },
1016
+ {
1017
+ "epoch": 0.27115384615384613,
1018
+ "grad_norm": 1.1015625,
1019
+ "learning_rate": 9.805528638214543e-06,
1020
+ "loss": 0.9903,
1021
+ "step": 141
1022
+ },
1023
+ {
1024
+ "epoch": 0.27307692307692305,
1025
+ "grad_norm": 1.1171875,
1026
+ "learning_rate": 9.801113424290381e-06,
1027
+ "loss": 0.9754,
1028
+ "step": 142
1029
+ },
1030
+ {
1031
+ "epoch": 0.275,
1032
+ "grad_norm": 1.125,
1033
+ "learning_rate": 9.796649667271905e-06,
1034
+ "loss": 0.8977,
1035
+ "step": 143
1036
+ },
1037
+ {
1038
+ "epoch": 0.27692307692307694,
1039
+ "grad_norm": 1.109375,
1040
+ "learning_rate": 9.792137412291265e-06,
1041
+ "loss": 0.8798,
1042
+ "step": 144
1043
+ },
1044
+ {
1045
+ "epoch": 0.27884615384615385,
1046
+ "grad_norm": 1.15625,
1047
+ "learning_rate": 9.787576704970965e-06,
1048
+ "loss": 0.9382,
1049
+ "step": 145
1050
+ },
1051
+ {
1052
+ "epoch": 0.28076923076923077,
1053
+ "grad_norm": 1.1796875,
1054
+ "learning_rate": 9.7829675914234e-06,
1055
+ "loss": 0.9692,
1056
+ "step": 146
1057
+ },
1058
+ {
1059
+ "epoch": 0.2826923076923077,
1060
+ "grad_norm": 1.125,
1061
+ "learning_rate": 9.778310118250397e-06,
1062
+ "loss": 0.8939,
1063
+ "step": 147
1064
+ },
1065
+ {
1066
+ "epoch": 0.2846153846153846,
1067
+ "grad_norm": 1.1640625,
1068
+ "learning_rate": 9.77360433254273e-06,
1069
+ "loss": 1.0047,
1070
+ "step": 148
1071
+ },
1072
+ {
1073
+ "epoch": 0.2865384615384615,
1074
+ "grad_norm": 1.1328125,
1075
+ "learning_rate": 9.768850281879651e-06,
1076
+ "loss": 0.9484,
1077
+ "step": 149
1078
+ },
1079
+ {
1080
+ "epoch": 0.28846153846153844,
1081
+ "grad_norm": 1.1171875,
1082
+ "learning_rate": 9.764048014328417e-06,
1083
+ "loss": 0.9004,
1084
+ "step": 150
1085
+ },
1086
+ {
1087
+ "epoch": 0.2903846153846154,
1088
+ "grad_norm": 1.1171875,
1089
+ "learning_rate": 9.759197578443787e-06,
1090
+ "loss": 0.9281,
1091
+ "step": 151
1092
+ },
1093
+ {
1094
+ "epoch": 0.2923076923076923,
1095
+ "grad_norm": 1.2109375,
1096
+ "learning_rate": 9.754299023267548e-06,
1097
+ "loss": 0.9638,
1098
+ "step": 152
1099
+ },
1100
+ {
1101
+ "epoch": 0.29423076923076924,
1102
+ "grad_norm": 1.1640625,
1103
+ "learning_rate": 9.74935239832801e-06,
1104
+ "loss": 0.9366,
1105
+ "step": 153
1106
+ },
1107
+ {
1108
+ "epoch": 0.29615384615384616,
1109
+ "grad_norm": 1.1796875,
1110
+ "learning_rate": 9.7443577536395e-06,
1111
+ "loss": 0.9125,
1112
+ "step": 154
1113
+ },
1114
+ {
1115
+ "epoch": 0.2980769230769231,
1116
+ "grad_norm": 1.125,
1117
+ "learning_rate": 9.739315139701868e-06,
1118
+ "loss": 0.9226,
1119
+ "step": 155
1120
+ },
1121
+ {
1122
+ "epoch": 0.3,
1123
+ "grad_norm": 1.1484375,
1124
+ "learning_rate": 9.734224607499978e-06,
1125
+ "loss": 0.9384,
1126
+ "step": 156
1127
+ },
1128
+ {
1129
+ "epoch": 0.3019230769230769,
1130
+ "grad_norm": 1.2890625,
1131
+ "learning_rate": 9.729086208503174e-06,
1132
+ "loss": 0.8788,
1133
+ "step": 157
1134
+ },
1135
+ {
1136
+ "epoch": 0.3038461538461538,
1137
+ "grad_norm": 1.15625,
1138
+ "learning_rate": 9.723899994664779e-06,
1139
+ "loss": 0.9672,
1140
+ "step": 158
1141
+ },
1142
+ {
1143
+ "epoch": 0.3057692307692308,
1144
+ "grad_norm": 1.125,
1145
+ "learning_rate": 9.71866601842156e-06,
1146
+ "loss": 0.8889,
1147
+ "step": 159
1148
+ },
1149
+ {
1150
+ "epoch": 0.3076923076923077,
1151
+ "grad_norm": 1.15625,
1152
+ "learning_rate": 9.713384332693199e-06,
1153
+ "loss": 0.8975,
1154
+ "step": 160
1155
+ },
1156
+ {
1157
+ "epoch": 0.3096153846153846,
1158
+ "grad_norm": 1.1015625,
1159
+ "learning_rate": 9.708054990881763e-06,
1160
+ "loss": 0.9614,
1161
+ "step": 161
1162
+ },
1163
+ {
1164
+ "epoch": 0.31153846153846154,
1165
+ "grad_norm": 1.125,
1166
+ "learning_rate": 9.702678046871157e-06,
1167
+ "loss": 0.9061,
1168
+ "step": 162
1169
+ },
1170
+ {
1171
+ "epoch": 0.31346153846153846,
1172
+ "grad_norm": 1.1484375,
1173
+ "learning_rate": 9.69725355502658e-06,
1174
+ "loss": 0.9322,
1175
+ "step": 163
1176
+ },
1177
+ {
1178
+ "epoch": 0.3153846153846154,
1179
+ "grad_norm": 1.15625,
1180
+ "learning_rate": 9.691781570193983e-06,
1181
+ "loss": 0.9512,
1182
+ "step": 164
1183
+ },
1184
+ {
1185
+ "epoch": 0.3173076923076923,
1186
+ "grad_norm": 1.1328125,
1187
+ "learning_rate": 9.686262147699507e-06,
1188
+ "loss": 1.0271,
1189
+ "step": 165
1190
+ },
1191
+ {
1192
+ "epoch": 0.3192307692307692,
1193
+ "grad_norm": 1.125,
1194
+ "learning_rate": 9.680695343348923e-06,
1195
+ "loss": 0.9529,
1196
+ "step": 166
1197
+ },
1198
+ {
1199
+ "epoch": 0.3211538461538462,
1200
+ "grad_norm": 1.203125,
1201
+ "learning_rate": 9.675081213427076e-06,
1202
+ "loss": 0.9636,
1203
+ "step": 167
1204
+ },
1205
+ {
1206
+ "epoch": 0.3230769230769231,
1207
+ "grad_norm": 1.109375,
1208
+ "learning_rate": 9.669419814697303e-06,
1209
+ "loss": 0.8879,
1210
+ "step": 168
1211
+ },
1212
+ {
1213
+ "epoch": 0.325,
1214
+ "grad_norm": 1.125,
1215
+ "learning_rate": 9.663711204400872e-06,
1216
+ "loss": 0.8889,
1217
+ "step": 169
1218
+ },
1219
+ {
1220
+ "epoch": 0.3269230769230769,
1221
+ "grad_norm": 1.0546875,
1222
+ "learning_rate": 9.657955440256396e-06,
1223
+ "loss": 0.9735,
1224
+ "step": 170
1225
+ },
1226
+ {
1227
+ "epoch": 0.32884615384615384,
1228
+ "grad_norm": 1.1015625,
1229
+ "learning_rate": 9.65215258045925e-06,
1230
+ "loss": 1.0088,
1231
+ "step": 171
1232
+ },
1233
+ {
1234
+ "epoch": 0.33076923076923076,
1235
+ "grad_norm": 1.1015625,
1236
+ "learning_rate": 9.64630268368099e-06,
1237
+ "loss": 0.9318,
1238
+ "step": 172
1239
+ },
1240
+ {
1241
+ "epoch": 0.3326923076923077,
1242
+ "grad_norm": 1.09375,
1243
+ "learning_rate": 9.640405809068743e-06,
1244
+ "loss": 0.9902,
1245
+ "step": 173
1246
+ },
1247
+ {
1248
+ "epoch": 0.3346153846153846,
1249
+ "grad_norm": 1.109375,
1250
+ "learning_rate": 9.634462016244625e-06,
1251
+ "loss": 0.9315,
1252
+ "step": 174
1253
+ },
1254
+ {
1255
+ "epoch": 0.33653846153846156,
1256
+ "grad_norm": 1.0546875,
1257
+ "learning_rate": 9.628471365305134e-06,
1258
+ "loss": 0.931,
1259
+ "step": 175
1260
+ },
1261
+ {
1262
+ "epoch": 0.3384615384615385,
1263
+ "grad_norm": 1.1015625,
1264
+ "learning_rate": 9.622433916820539e-06,
1265
+ "loss": 0.9167,
1266
+ "step": 176
1267
+ },
1268
+ {
1269
+ "epoch": 0.3403846153846154,
1270
+ "grad_norm": 1.109375,
1271
+ "learning_rate": 9.616349731834271e-06,
1272
+ "loss": 0.9214,
1273
+ "step": 177
1274
+ },
1275
+ {
1276
+ "epoch": 0.3423076923076923,
1277
+ "grad_norm": 1.1171875,
1278
+ "learning_rate": 9.610218871862303e-06,
1279
+ "loss": 0.945,
1280
+ "step": 178
1281
+ },
1282
+ {
1283
+ "epoch": 0.34423076923076923,
1284
+ "grad_norm": 1.1171875,
1285
+ "learning_rate": 9.604041398892528e-06,
1286
+ "loss": 0.9445,
1287
+ "step": 179
1288
+ },
1289
+ {
1290
+ "epoch": 0.34615384615384615,
1291
+ "grad_norm": 1.140625,
1292
+ "learning_rate": 9.597817375384138e-06,
1293
+ "loss": 1.0135,
1294
+ "step": 180
1295
+ },
1296
+ {
1297
+ "epoch": 0.34807692307692306,
1298
+ "grad_norm": 1.3203125,
1299
+ "learning_rate": 9.591546864266983e-06,
1300
+ "loss": 0.9393,
1301
+ "step": 181
1302
+ },
1303
+ {
1304
+ "epoch": 0.35,
1305
+ "grad_norm": 1.1171875,
1306
+ "learning_rate": 9.585229928940944e-06,
1307
+ "loss": 0.9273,
1308
+ "step": 182
1309
+ },
1310
+ {
1311
+ "epoch": 0.35192307692307695,
1312
+ "grad_norm": 1.1171875,
1313
+ "learning_rate": 9.578866633275289e-06,
1314
+ "loss": 1.0091,
1315
+ "step": 183
1316
+ },
1317
+ {
1318
+ "epoch": 0.35384615384615387,
1319
+ "grad_norm": 1.0859375,
1320
+ "learning_rate": 9.572457041608018e-06,
1321
+ "loss": 0.9301,
1322
+ "step": 184
1323
+ },
1324
+ {
1325
+ "epoch": 0.3557692307692308,
1326
+ "grad_norm": 1.046875,
1327
+ "learning_rate": 9.56600121874523e-06,
1328
+ "loss": 0.991,
1329
+ "step": 185
1330
+ },
1331
+ {
1332
+ "epoch": 0.3576923076923077,
1333
+ "grad_norm": 1.125,
1334
+ "learning_rate": 9.55949922996045e-06,
1335
+ "loss": 0.8897,
1336
+ "step": 186
1337
+ },
1338
+ {
1339
+ "epoch": 0.3596153846153846,
1340
+ "grad_norm": 1.109375,
1341
+ "learning_rate": 9.55295114099399e-06,
1342
+ "loss": 0.8844,
1343
+ "step": 187
1344
+ },
1345
+ {
1346
+ "epoch": 0.36153846153846153,
1347
+ "grad_norm": 1.0703125,
1348
+ "learning_rate": 9.546357018052254e-06,
1349
+ "loss": 0.9226,
1350
+ "step": 188
1351
+ },
1352
+ {
1353
+ "epoch": 0.36346153846153845,
1354
+ "grad_norm": 1.078125,
1355
+ "learning_rate": 9.539716927807102e-06,
1356
+ "loss": 0.9273,
1357
+ "step": 189
1358
+ },
1359
+ {
1360
+ "epoch": 0.36538461538461536,
1361
+ "grad_norm": 1.109375,
1362
+ "learning_rate": 9.533030937395151e-06,
1363
+ "loss": 0.9116,
1364
+ "step": 190
1365
+ },
1366
+ {
1367
+ "epoch": 0.36730769230769234,
1368
+ "grad_norm": 1.0703125,
1369
+ "learning_rate": 9.526299114417108e-06,
1370
+ "loss": 0.9732,
1371
+ "step": 191
1372
+ },
1373
+ {
1374
+ "epoch": 0.36923076923076925,
1375
+ "grad_norm": 1.078125,
1376
+ "learning_rate": 9.519521526937087e-06,
1377
+ "loss": 0.918,
1378
+ "step": 192
1379
+ },
1380
+ {
1381
+ "epoch": 0.37115384615384617,
1382
+ "grad_norm": 1.03125,
1383
+ "learning_rate": 9.512698243481914e-06,
1384
+ "loss": 0.9045,
1385
+ "step": 193
1386
+ },
1387
+ {
1388
+ "epoch": 0.3730769230769231,
1389
+ "grad_norm": 1.078125,
1390
+ "learning_rate": 9.505829333040437e-06,
1391
+ "loss": 0.9189,
1392
+ "step": 194
1393
+ },
1394
+ {
1395
+ "epoch": 0.375,
1396
+ "grad_norm": 1.1015625,
1397
+ "learning_rate": 9.498914865062831e-06,
1398
+ "loss": 0.9853,
1399
+ "step": 195
1400
+ },
1401
+ {
1402
+ "epoch": 0.375,
1403
+ "eval_loss": 0.9608431458473206,
1404
+ "eval_runtime": 34.5143,
1405
+ "eval_samples_per_second": 67.885,
1406
+ "eval_steps_per_second": 16.978,
1407
+ "step": 195
1408
+ },
1409
+ {
1410
+ "epoch": 0.3769230769230769,
1411
+ "grad_norm": 1.046875,
1412
+ "learning_rate": 9.491954909459895e-06,
1413
+ "loss": 0.9078,
1414
+ "step": 196
1415
+ },
1416
+ {
1417
+ "epoch": 0.37884615384615383,
1418
+ "grad_norm": 1.0703125,
1419
+ "learning_rate": 9.484949536602343e-06,
1420
+ "loss": 0.8859,
1421
+ "step": 197
1422
+ },
1423
+ {
1424
+ "epoch": 0.38076923076923075,
1425
+ "grad_norm": 1.0390625,
1426
+ "learning_rate": 9.477898817320094e-06,
1427
+ "loss": 0.9183,
1428
+ "step": 198
1429
+ },
1430
+ {
1431
+ "epoch": 0.38269230769230766,
1432
+ "grad_norm": 1.0390625,
1433
+ "learning_rate": 9.470802822901558e-06,
1434
+ "loss": 0.918,
1435
+ "step": 199
1436
+ },
1437
+ {
1438
+ "epoch": 0.38461538461538464,
1439
+ "grad_norm": 1.0625,
1440
+ "learning_rate": 9.463661625092907e-06,
1441
+ "loss": 0.9444,
1442
+ "step": 200
1443
+ },
1444
+ {
1445
+ "epoch": 0.38653846153846155,
1446
+ "grad_norm": 1.046875,
1447
+ "learning_rate": 9.45647529609736e-06,
1448
+ "loss": 0.8958,
1449
+ "step": 201
1450
+ },
1451
+ {
1452
+ "epoch": 0.38846153846153847,
1453
+ "grad_norm": 1.046875,
1454
+ "learning_rate": 9.44924390857445e-06,
1455
+ "loss": 0.9071,
1456
+ "step": 202
1457
+ },
1458
+ {
1459
+ "epoch": 0.3903846153846154,
1460
+ "grad_norm": 1.09375,
1461
+ "learning_rate": 9.44196753563928e-06,
1462
+ "loss": 0.962,
1463
+ "step": 203
1464
+ },
1465
+ {
1466
+ "epoch": 0.3923076923076923,
1467
+ "grad_norm": 1.0546875,
1468
+ "learning_rate": 9.434646250861801e-06,
1469
+ "loss": 0.9814,
1470
+ "step": 204
1471
+ },
1472
+ {
1473
+ "epoch": 0.3942307692307692,
1474
+ "grad_norm": 1.09375,
1475
+ "learning_rate": 9.427280128266049e-06,
1476
+ "loss": 0.9873,
1477
+ "step": 205
1478
+ },
1479
+ {
1480
+ "epoch": 0.39615384615384613,
1481
+ "grad_norm": 1.15625,
1482
+ "learning_rate": 9.419869242329417e-06,
1483
+ "loss": 0.9034,
1484
+ "step": 206
1485
+ },
1486
+ {
1487
+ "epoch": 0.39807692307692305,
1488
+ "grad_norm": 1.0703125,
1489
+ "learning_rate": 9.412413667981884e-06,
1490
+ "loss": 0.8953,
1491
+ "step": 207
1492
+ },
1493
+ {
1494
+ "epoch": 0.4,
1495
+ "grad_norm": 1.1015625,
1496
+ "learning_rate": 9.404913480605264e-06,
1497
+ "loss": 1.0005,
1498
+ "step": 208
1499
+ },
1500
+ {
1501
+ "epoch": 0.40192307692307694,
1502
+ "grad_norm": 1.0625,
1503
+ "learning_rate": 9.397368756032445e-06,
1504
+ "loss": 0.9455,
1505
+ "step": 209
1506
+ },
1507
+ {
1508
+ "epoch": 0.40384615384615385,
1509
+ "grad_norm": 1.0234375,
1510
+ "learning_rate": 9.389779570546628e-06,
1511
+ "loss": 0.9336,
1512
+ "step": 210
1513
+ },
1514
+ {
1515
+ "epoch": 0.40576923076923077,
1516
+ "grad_norm": 1.1953125,
1517
+ "learning_rate": 9.38214600088054e-06,
1518
+ "loss": 0.8907,
1519
+ "step": 211
1520
+ },
1521
+ {
1522
+ "epoch": 0.4076923076923077,
1523
+ "grad_norm": 1.078125,
1524
+ "learning_rate": 9.374468124215676e-06,
1525
+ "loss": 0.8735,
1526
+ "step": 212
1527
+ },
1528
+ {
1529
+ "epoch": 0.4096153846153846,
1530
+ "grad_norm": 1.0625,
1531
+ "learning_rate": 9.366746018181503e-06,
1532
+ "loss": 0.9682,
1533
+ "step": 213
1534
+ },
1535
+ {
1536
+ "epoch": 0.4115384615384615,
1537
+ "grad_norm": 1.0078125,
1538
+ "learning_rate": 9.358979760854686e-06,
1539
+ "loss": 0.9069,
1540
+ "step": 214
1541
+ },
1542
+ {
1543
+ "epoch": 0.41346153846153844,
1544
+ "grad_norm": 0.99609375,
1545
+ "learning_rate": 9.351169430758293e-06,
1546
+ "loss": 0.9371,
1547
+ "step": 215
1548
+ },
1549
+ {
1550
+ "epoch": 0.4153846153846154,
1551
+ "grad_norm": 1.0703125,
1552
+ "learning_rate": 9.343315106861008e-06,
1553
+ "loss": 0.9691,
1554
+ "step": 216
1555
+ },
1556
+ {
1557
+ "epoch": 0.4173076923076923,
1558
+ "grad_norm": 1.0234375,
1559
+ "learning_rate": 9.33541686857632e-06,
1560
+ "loss": 0.9521,
1561
+ "step": 217
1562
+ },
1563
+ {
1564
+ "epoch": 0.41923076923076924,
1565
+ "grad_norm": 1.0078125,
1566
+ "learning_rate": 9.327474795761734e-06,
1567
+ "loss": 0.9503,
1568
+ "step": 218
1569
+ },
1570
+ {
1571
+ "epoch": 0.42115384615384616,
1572
+ "grad_norm": 1.015625,
1573
+ "learning_rate": 9.31948896871795e-06,
1574
+ "loss": 0.9311,
1575
+ "step": 219
1576
+ },
1577
+ {
1578
+ "epoch": 0.4230769230769231,
1579
+ "grad_norm": 1.046875,
1580
+ "learning_rate": 9.311459468188066e-06,
1581
+ "loss": 0.8998,
1582
+ "step": 220
1583
+ },
1584
+ {
1585
+ "epoch": 0.425,
1586
+ "grad_norm": 1.0859375,
1587
+ "learning_rate": 9.303386375356752e-06,
1588
+ "loss": 0.8776,
1589
+ "step": 221
1590
+ },
1591
+ {
1592
+ "epoch": 0.4269230769230769,
1593
+ "grad_norm": 1.0703125,
1594
+ "learning_rate": 9.295269771849426e-06,
1595
+ "loss": 0.9003,
1596
+ "step": 222
1597
+ },
1598
+ {
1599
+ "epoch": 0.4288461538461538,
1600
+ "grad_norm": 0.98046875,
1601
+ "learning_rate": 9.28710973973144e-06,
1602
+ "loss": 0.9597,
1603
+ "step": 223
1604
+ },
1605
+ {
1606
+ "epoch": 0.4307692307692308,
1607
+ "grad_norm": 0.96875,
1608
+ "learning_rate": 9.278906361507238e-06,
1609
+ "loss": 0.8842,
1610
+ "step": 224
1611
+ },
1612
+ {
1613
+ "epoch": 0.4326923076923077,
1614
+ "grad_norm": 1.0078125,
1615
+ "learning_rate": 9.270659720119533e-06,
1616
+ "loss": 0.9524,
1617
+ "step": 225
1618
+ },
1619
+ {
1620
+ "epoch": 0.4346153846153846,
1621
+ "grad_norm": 1.0625,
1622
+ "learning_rate": 9.262369898948462e-06,
1623
+ "loss": 0.9271,
1624
+ "step": 226
1625
+ },
1626
+ {
1627
+ "epoch": 0.43653846153846154,
1628
+ "grad_norm": 1.078125,
1629
+ "learning_rate": 9.254036981810741e-06,
1630
+ "loss": 0.9627,
1631
+ "step": 227
1632
+ },
1633
+ {
1634
+ "epoch": 0.43846153846153846,
1635
+ "grad_norm": 1.015625,
1636
+ "learning_rate": 9.245661052958823e-06,
1637
+ "loss": 0.9305,
1638
+ "step": 228
1639
+ },
1640
+ {
1641
+ "epoch": 0.4403846153846154,
1642
+ "grad_norm": 1.03125,
1643
+ "learning_rate": 9.237242197080045e-06,
1644
+ "loss": 0.9855,
1645
+ "step": 229
1646
+ },
1647
+ {
1648
+ "epoch": 0.4423076923076923,
1649
+ "grad_norm": 1.0390625,
1650
+ "learning_rate": 9.22878049929577e-06,
1651
+ "loss": 0.9237,
1652
+ "step": 230
1653
+ },
1654
+ {
1655
+ "epoch": 0.4442307692307692,
1656
+ "grad_norm": 1.0078125,
1657
+ "learning_rate": 9.220276045160524e-06,
1658
+ "loss": 0.9733,
1659
+ "step": 231
1660
+ },
1661
+ {
1662
+ "epoch": 0.4461538461538462,
1663
+ "grad_norm": 0.9765625,
1664
+ "learning_rate": 9.211728920661136e-06,
1665
+ "loss": 0.9613,
1666
+ "step": 232
1667
+ },
1668
+ {
1669
+ "epoch": 0.4480769230769231,
1670
+ "grad_norm": 0.9609375,
1671
+ "learning_rate": 9.203139212215868e-06,
1672
+ "loss": 0.9317,
1673
+ "step": 233
1674
+ },
1675
+ {
1676
+ "epoch": 0.45,
1677
+ "grad_norm": 0.99609375,
1678
+ "learning_rate": 9.19450700667354e-06,
1679
+ "loss": 0.9067,
1680
+ "step": 234
1681
+ },
1682
+ {
1683
+ "epoch": 0.4519230769230769,
1684
+ "grad_norm": 0.9296875,
1685
+ "learning_rate": 9.185832391312644e-06,
1686
+ "loss": 0.9088,
1687
+ "step": 235
1688
+ },
1689
+ {
1690
+ "epoch": 0.45384615384615384,
1691
+ "grad_norm": 0.953125,
1692
+ "learning_rate": 9.17711545384048e-06,
1693
+ "loss": 0.9359,
1694
+ "step": 236
1695
+ },
1696
+ {
1697
+ "epoch": 0.45576923076923076,
1698
+ "grad_norm": 1.015625,
1699
+ "learning_rate": 9.168356282392253e-06,
1700
+ "loss": 0.96,
1701
+ "step": 237
1702
+ },
1703
+ {
1704
+ "epoch": 0.4576923076923077,
1705
+ "grad_norm": 0.90234375,
1706
+ "learning_rate": 9.159554965530184e-06,
1707
+ "loss": 0.9193,
1708
+ "step": 238
1709
+ },
1710
+ {
1711
+ "epoch": 0.4596153846153846,
1712
+ "grad_norm": 0.93359375,
1713
+ "learning_rate": 9.150711592242627e-06,
1714
+ "loss": 0.9439,
1715
+ "step": 239
1716
+ },
1717
+ {
1718
+ "epoch": 0.46153846153846156,
1719
+ "grad_norm": 1.0078125,
1720
+ "learning_rate": 9.14182625194315e-06,
1721
+ "loss": 0.8804,
1722
+ "step": 240
1723
+ },
1724
+ {
1725
+ "epoch": 0.4634615384615385,
1726
+ "grad_norm": 0.9375,
1727
+ "learning_rate": 9.132899034469648e-06,
1728
+ "loss": 0.916,
1729
+ "step": 241
1730
+ },
1731
+ {
1732
+ "epoch": 0.4653846153846154,
1733
+ "grad_norm": 0.98828125,
1734
+ "learning_rate": 9.123930030083425e-06,
1735
+ "loss": 0.8793,
1736
+ "step": 242
1737
+ },
1738
+ {
1739
+ "epoch": 0.4673076923076923,
1740
+ "grad_norm": 0.93359375,
1741
+ "learning_rate": 9.114919329468283e-06,
1742
+ "loss": 0.9162,
1743
+ "step": 243
1744
+ },
1745
+ {
1746
+ "epoch": 0.46923076923076923,
1747
+ "grad_norm": 0.98046875,
1748
+ "learning_rate": 9.10586702372961e-06,
1749
+ "loss": 0.9098,
1750
+ "step": 244
1751
+ },
1752
+ {
1753
+ "epoch": 0.47115384615384615,
1754
+ "grad_norm": 0.91015625,
1755
+ "learning_rate": 9.09677320439345e-06,
1756
+ "loss": 0.9219,
1757
+ "step": 245
1758
+ },
1759
+ {
1760
+ "epoch": 0.47307692307692306,
1761
+ "grad_norm": 0.921875,
1762
+ "learning_rate": 9.087637963405586e-06,
1763
+ "loss": 0.9554,
1764
+ "step": 246
1765
+ },
1766
+ {
1767
+ "epoch": 0.475,
1768
+ "grad_norm": 0.9140625,
1769
+ "learning_rate": 9.07846139313061e-06,
1770
+ "loss": 0.9994,
1771
+ "step": 247
1772
+ },
1773
+ {
1774
+ "epoch": 0.47692307692307695,
1775
+ "grad_norm": 0.96875,
1776
+ "learning_rate": 9.069243586350976e-06,
1777
+ "loss": 0.9052,
1778
+ "step": 248
1779
+ },
1780
+ {
1781
+ "epoch": 0.47884615384615387,
1782
+ "grad_norm": 0.94921875,
1783
+ "learning_rate": 9.059984636266082e-06,
1784
+ "loss": 0.9232,
1785
+ "step": 249
1786
+ },
1787
+ {
1788
+ "epoch": 0.4807692307692308,
1789
+ "grad_norm": 0.97265625,
1790
+ "learning_rate": 9.050684636491317e-06,
1791
+ "loss": 0.8964,
1792
+ "step": 250
1793
+ },
1794
+ {
1795
+ "epoch": 0.4826923076923077,
1796
+ "grad_norm": 0.91015625,
1797
+ "learning_rate": 9.041343681057106e-06,
1798
+ "loss": 0.9386,
1799
+ "step": 251
1800
+ },
1801
+ {
1802
+ "epoch": 0.4846153846153846,
1803
+ "grad_norm": 0.8984375,
1804
+ "learning_rate": 9.03196186440798e-06,
1805
+ "loss": 0.9078,
1806
+ "step": 252
1807
+ },
1808
+ {
1809
+ "epoch": 0.48653846153846153,
1810
+ "grad_norm": 0.96875,
1811
+ "learning_rate": 9.022539281401601e-06,
1812
+ "loss": 0.9056,
1813
+ "step": 253
1814
+ },
1815
+ {
1816
+ "epoch": 0.48846153846153845,
1817
+ "grad_norm": 0.8671875,
1818
+ "learning_rate": 9.013076027307817e-06,
1819
+ "loss": 0.8973,
1820
+ "step": 254
1821
+ },
1822
+ {
1823
+ "epoch": 0.49038461538461536,
1824
+ "grad_norm": 0.921875,
1825
+ "learning_rate": 9.00357219780769e-06,
1826
+ "loss": 0.9663,
1827
+ "step": 255
1828
+ },
1829
+ {
1830
+ "epoch": 0.49230769230769234,
1831
+ "grad_norm": 0.90234375,
1832
+ "learning_rate": 8.994027888992533e-06,
1833
+ "loss": 0.8857,
1834
+ "step": 256
1835
+ },
1836
+ {
1837
+ "epoch": 0.49423076923076925,
1838
+ "grad_norm": 0.9921875,
1839
+ "learning_rate": 8.984443197362938e-06,
1840
+ "loss": 0.9815,
1841
+ "step": 257
1842
+ },
1843
+ {
1844
+ "epoch": 0.49615384615384617,
1845
+ "grad_norm": 0.91015625,
1846
+ "learning_rate": 8.974818219827796e-06,
1847
+ "loss": 0.9693,
1848
+ "step": 258
1849
+ },
1850
+ {
1851
+ "epoch": 0.4980769230769231,
1852
+ "grad_norm": 1.2265625,
1853
+ "learning_rate": 8.965153053703325e-06,
1854
+ "loss": 0.8971,
1855
+ "step": 259
1856
+ },
1857
+ {
1858
+ "epoch": 0.5,
1859
+ "grad_norm": 0.94140625,
1860
+ "learning_rate": 8.955447796712083e-06,
1861
+ "loss": 0.8998,
1862
+ "step": 260
1863
+ },
1864
+ {
1865
+ "epoch": 0.5,
1866
+ "eval_loss": 0.9489682912826538,
1867
+ "eval_runtime": 34.8214,
1868
+ "eval_samples_per_second": 67.286,
1869
+ "eval_steps_per_second": 16.829,
1870
+ "step": 260
1871
+ },
1872
+ {
1873
+ "epoch": 0.5019230769230769,
1874
+ "grad_norm": 0.9609375,
1875
+ "learning_rate": 8.94570254698197e-06,
1876
+ "loss": 0.9153,
1877
+ "step": 261
1878
+ },
1879
+ {
1880
+ "epoch": 0.5038461538461538,
1881
+ "grad_norm": 0.94140625,
1882
+ "learning_rate": 8.935917403045251e-06,
1883
+ "loss": 0.9449,
1884
+ "step": 262
1885
+ },
1886
+ {
1887
+ "epoch": 0.5057692307692307,
1888
+ "grad_norm": 0.91796875,
1889
+ "learning_rate": 8.926092463837557e-06,
1890
+ "loss": 0.9087,
1891
+ "step": 263
1892
+ },
1893
+ {
1894
+ "epoch": 0.5076923076923077,
1895
+ "grad_norm": 0.890625,
1896
+ "learning_rate": 8.916227828696873e-06,
1897
+ "loss": 0.8946,
1898
+ "step": 264
1899
+ },
1900
+ {
1901
+ "epoch": 0.5096153846153846,
1902
+ "grad_norm": 0.8671875,
1903
+ "learning_rate": 8.906323597362547e-06,
1904
+ "loss": 0.9261,
1905
+ "step": 265
1906
+ },
1907
+ {
1908
+ "epoch": 0.5115384615384615,
1909
+ "grad_norm": 0.8828125,
1910
+ "learning_rate": 8.896379869974273e-06,
1911
+ "loss": 0.8826,
1912
+ "step": 266
1913
+ },
1914
+ {
1915
+ "epoch": 0.5134615384615384,
1916
+ "grad_norm": 0.9609375,
1917
+ "learning_rate": 8.886396747071085e-06,
1918
+ "loss": 0.9224,
1919
+ "step": 267
1920
+ },
1921
+ {
1922
+ "epoch": 0.5153846153846153,
1923
+ "grad_norm": 0.84765625,
1924
+ "learning_rate": 8.876374329590331e-06,
1925
+ "loss": 0.8849,
1926
+ "step": 268
1927
+ },
1928
+ {
1929
+ "epoch": 0.5173076923076924,
1930
+ "grad_norm": 0.890625,
1931
+ "learning_rate": 8.866312718866669e-06,
1932
+ "loss": 0.9516,
1933
+ "step": 269
1934
+ },
1935
+ {
1936
+ "epoch": 0.5192307692307693,
1937
+ "grad_norm": 0.921875,
1938
+ "learning_rate": 8.85621201663102e-06,
1939
+ "loss": 0.9049,
1940
+ "step": 270
1941
+ },
1942
+ {
1943
+ "epoch": 0.5211538461538462,
1944
+ "grad_norm": 0.8125,
1945
+ "learning_rate": 8.846072325009562e-06,
1946
+ "loss": 0.8953,
1947
+ "step": 271
1948
+ },
1949
+ {
1950
+ "epoch": 0.5230769230769231,
1951
+ "grad_norm": 0.8046875,
1952
+ "learning_rate": 8.83589374652268e-06,
1953
+ "loss": 0.8507,
1954
+ "step": 272
1955
+ },
1956
+ {
1957
+ "epoch": 0.525,
1958
+ "grad_norm": 0.8359375,
1959
+ "learning_rate": 8.825676384083936e-06,
1960
+ "loss": 0.8904,
1961
+ "step": 273
1962
+ },
1963
+ {
1964
+ "epoch": 0.5269230769230769,
1965
+ "grad_norm": 0.83203125,
1966
+ "learning_rate": 8.815420340999034e-06,
1967
+ "loss": 0.9469,
1968
+ "step": 274
1969
+ },
1970
+ {
1971
+ "epoch": 0.5288461538461539,
1972
+ "grad_norm": 0.90234375,
1973
+ "learning_rate": 8.805125720964766e-06,
1974
+ "loss": 0.9144,
1975
+ "step": 275
1976
+ },
1977
+ {
1978
+ "epoch": 0.5307692307692308,
1979
+ "grad_norm": 0.828125,
1980
+ "learning_rate": 8.79479262806797e-06,
1981
+ "loss": 0.8757,
1982
+ "step": 276
1983
+ },
1984
+ {
1985
+ "epoch": 0.5326923076923077,
1986
+ "grad_norm": 0.83203125,
1987
+ "learning_rate": 8.784421166784476e-06,
1988
+ "loss": 0.8781,
1989
+ "step": 277
1990
+ },
1991
+ {
1992
+ "epoch": 0.5346153846153846,
1993
+ "grad_norm": 0.89453125,
1994
+ "learning_rate": 8.774011441978046e-06,
1995
+ "loss": 0.9348,
1996
+ "step": 278
1997
+ },
1998
+ {
1999
+ "epoch": 0.5365384615384615,
2000
+ "grad_norm": 0.9296875,
2001
+ "learning_rate": 8.763563558899317e-06,
2002
+ "loss": 0.9949,
2003
+ "step": 279
2004
+ },
2005
+ {
2006
+ "epoch": 0.5384615384615384,
2007
+ "grad_norm": 0.8359375,
2008
+ "learning_rate": 8.75307762318474e-06,
2009
+ "loss": 0.9171,
2010
+ "step": 280
2011
+ },
2012
+ {
2013
+ "epoch": 0.5403846153846154,
2014
+ "grad_norm": 0.890625,
2015
+ "learning_rate": 8.742553740855507e-06,
2016
+ "loss": 1.0024,
2017
+ "step": 281
2018
+ },
2019
+ {
2020
+ "epoch": 0.5423076923076923,
2021
+ "grad_norm": 0.82421875,
2022
+ "learning_rate": 8.731992018316478e-06,
2023
+ "loss": 0.8619,
2024
+ "step": 282
2025
+ },
2026
+ {
2027
+ "epoch": 0.5442307692307692,
2028
+ "grad_norm": 0.7890625,
2029
+ "learning_rate": 8.721392562355113e-06,
2030
+ "loss": 0.955,
2031
+ "step": 283
2032
+ },
2033
+ {
2034
+ "epoch": 0.5461538461538461,
2035
+ "grad_norm": 1.046875,
2036
+ "learning_rate": 8.71075548014038e-06,
2037
+ "loss": 0.9189,
2038
+ "step": 284
2039
+ },
2040
+ {
2041
+ "epoch": 0.5480769230769231,
2042
+ "grad_norm": 0.9140625,
2043
+ "learning_rate": 8.700080879221689e-06,
2044
+ "loss": 0.9118,
2045
+ "step": 285
2046
+ },
2047
+ {
2048
+ "epoch": 0.55,
2049
+ "grad_norm": 0.8203125,
2050
+ "learning_rate": 8.689368867527781e-06,
2051
+ "loss": 0.8916,
2052
+ "step": 286
2053
+ },
2054
+ {
2055
+ "epoch": 0.551923076923077,
2056
+ "grad_norm": 0.86328125,
2057
+ "learning_rate": 8.67861955336566e-06,
2058
+ "loss": 0.8934,
2059
+ "step": 287
2060
+ },
2061
+ {
2062
+ "epoch": 0.5538461538461539,
2063
+ "grad_norm": 0.875,
2064
+ "learning_rate": 8.667833045419483e-06,
2065
+ "loss": 0.8921,
2066
+ "step": 288
2067
+ },
2068
+ {
2069
+ "epoch": 0.5557692307692308,
2070
+ "grad_norm": 0.87890625,
2071
+ "learning_rate": 8.657009452749466e-06,
2072
+ "loss": 1.0005,
2073
+ "step": 289
2074
+ },
2075
+ {
2076
+ "epoch": 0.5576923076923077,
2077
+ "grad_norm": 0.85546875,
2078
+ "learning_rate": 8.646148884790786e-06,
2079
+ "loss": 0.8828,
2080
+ "step": 290
2081
+ },
2082
+ {
2083
+ "epoch": 0.5596153846153846,
2084
+ "grad_norm": 0.82421875,
2085
+ "learning_rate": 8.635251451352463e-06,
2086
+ "loss": 0.8704,
2087
+ "step": 291
2088
+ },
2089
+ {
2090
+ "epoch": 0.5615384615384615,
2091
+ "grad_norm": 0.7890625,
2092
+ "learning_rate": 8.624317262616261e-06,
2093
+ "loss": 0.9182,
2094
+ "step": 292
2095
+ },
2096
+ {
2097
+ "epoch": 0.5634615384615385,
2098
+ "grad_norm": 0.86328125,
2099
+ "learning_rate": 8.613346429135567e-06,
2100
+ "loss": 0.9128,
2101
+ "step": 293
2102
+ },
2103
+ {
2104
+ "epoch": 0.5653846153846154,
2105
+ "grad_norm": 0.83984375,
2106
+ "learning_rate": 8.602339061834278e-06,
2107
+ "loss": 0.9893,
2108
+ "step": 294
2109
+ },
2110
+ {
2111
+ "epoch": 0.5673076923076923,
2112
+ "grad_norm": 0.84375,
2113
+ "learning_rate": 8.591295272005674e-06,
2114
+ "loss": 0.9299,
2115
+ "step": 295
2116
+ },
2117
+ {
2118
+ "epoch": 0.5692307692307692,
2119
+ "grad_norm": 0.77734375,
2120
+ "learning_rate": 8.5802151713113e-06,
2121
+ "loss": 0.894,
2122
+ "step": 296
2123
+ },
2124
+ {
2125
+ "epoch": 0.5711538461538461,
2126
+ "grad_norm": 0.8359375,
2127
+ "learning_rate": 8.569098871779828e-06,
2128
+ "loss": 0.9472,
2129
+ "step": 297
2130
+ },
2131
+ {
2132
+ "epoch": 0.573076923076923,
2133
+ "grad_norm": 0.828125,
2134
+ "learning_rate": 8.557946485805932e-06,
2135
+ "loss": 0.919,
2136
+ "step": 298
2137
+ },
2138
+ {
2139
+ "epoch": 0.575,
2140
+ "grad_norm": 0.8046875,
2141
+ "learning_rate": 8.546758126149148e-06,
2142
+ "loss": 0.882,
2143
+ "step": 299
2144
+ },
2145
+ {
2146
+ "epoch": 0.5769230769230769,
2147
+ "grad_norm": 0.88671875,
2148
+ "learning_rate": 8.535533905932739e-06,
2149
+ "loss": 0.8685,
2150
+ "step": 300
2151
+ },
2152
+ {
2153
+ "epoch": 0.5788461538461539,
2154
+ "grad_norm": 0.85546875,
2155
+ "learning_rate": 8.524273938642539e-06,
2156
+ "loss": 0.8966,
2157
+ "step": 301
2158
+ },
2159
+ {
2160
+ "epoch": 0.5807692307692308,
2161
+ "grad_norm": 0.796875,
2162
+ "learning_rate": 8.512978338125818e-06,
2163
+ "loss": 0.9205,
2164
+ "step": 302
2165
+ },
2166
+ {
2167
+ "epoch": 0.5826923076923077,
2168
+ "grad_norm": 0.83984375,
2169
+ "learning_rate": 8.501647218590127e-06,
2170
+ "loss": 0.9815,
2171
+ "step": 303
2172
+ },
2173
+ {
2174
+ "epoch": 0.5846153846153846,
2175
+ "grad_norm": 0.80859375,
2176
+ "learning_rate": 8.490280694602142e-06,
2177
+ "loss": 0.9317,
2178
+ "step": 304
2179
+ },
2180
+ {
2181
+ "epoch": 0.5865384615384616,
2182
+ "grad_norm": 0.7890625,
2183
+ "learning_rate": 8.478878881086505e-06,
2184
+ "loss": 0.9498,
2185
+ "step": 305
2186
+ },
2187
+ {
2188
+ "epoch": 0.5884615384615385,
2189
+ "grad_norm": 0.7578125,
2190
+ "learning_rate": 8.467441893324667e-06,
2191
+ "loss": 0.9088,
2192
+ "step": 306
2193
+ },
2194
+ {
2195
+ "epoch": 0.5903846153846154,
2196
+ "grad_norm": 0.81640625,
2197
+ "learning_rate": 8.455969846953711e-06,
2198
+ "loss": 0.8728,
2199
+ "step": 307
2200
+ },
2201
+ {
2202
+ "epoch": 0.5923076923076923,
2203
+ "grad_norm": 0.8046875,
2204
+ "learning_rate": 8.444462857965198e-06,
2205
+ "loss": 0.9345,
2206
+ "step": 308
2207
+ },
2208
+ {
2209
+ "epoch": 0.5942307692307692,
2210
+ "grad_norm": 0.8046875,
2211
+ "learning_rate": 8.432921042703985e-06,
2212
+ "loss": 0.8951,
2213
+ "step": 309
2214
+ },
2215
+ {
2216
+ "epoch": 0.5961538461538461,
2217
+ "grad_norm": 0.8046875,
2218
+ "learning_rate": 8.42134451786705e-06,
2219
+ "loss": 1.005,
2220
+ "step": 310
2221
+ },
2222
+ {
2223
+ "epoch": 0.5980769230769231,
2224
+ "grad_norm": 0.7734375,
2225
+ "learning_rate": 8.409733400502311e-06,
2226
+ "loss": 0.919,
2227
+ "step": 311
2228
+ },
2229
+ {
2230
+ "epoch": 0.6,
2231
+ "grad_norm": 0.81640625,
2232
+ "learning_rate": 8.398087808007447e-06,
2233
+ "loss": 0.882,
2234
+ "step": 312
2235
+ },
2236
+ {
2237
+ "epoch": 0.6019230769230769,
2238
+ "grad_norm": 0.84375,
2239
+ "learning_rate": 8.386407858128707e-06,
2240
+ "loss": 0.9254,
2241
+ "step": 313
2242
+ },
2243
+ {
2244
+ "epoch": 0.6038461538461538,
2245
+ "grad_norm": 0.77734375,
2246
+ "learning_rate": 8.374693668959717e-06,
2247
+ "loss": 0.9312,
2248
+ "step": 314
2249
+ },
2250
+ {
2251
+ "epoch": 0.6057692307692307,
2252
+ "grad_norm": 0.77734375,
2253
+ "learning_rate": 8.362945358940295e-06,
2254
+ "loss": 0.9124,
2255
+ "step": 315
2256
+ },
2257
+ {
2258
+ "epoch": 0.6076923076923076,
2259
+ "grad_norm": 0.765625,
2260
+ "learning_rate": 8.351163046855246e-06,
2261
+ "loss": 0.9181,
2262
+ "step": 316
2263
+ },
2264
+ {
2265
+ "epoch": 0.6096153846153847,
2266
+ "grad_norm": 0.77734375,
2267
+ "learning_rate": 8.339346851833163e-06,
2268
+ "loss": 0.9124,
2269
+ "step": 317
2270
+ },
2271
+ {
2272
+ "epoch": 0.6115384615384616,
2273
+ "grad_norm": 0.78515625,
2274
+ "learning_rate": 8.327496893345223e-06,
2275
+ "loss": 0.9282,
2276
+ "step": 318
2277
+ },
2278
+ {
2279
+ "epoch": 0.6134615384615385,
2280
+ "grad_norm": 0.7421875,
2281
+ "learning_rate": 8.315613291203977e-06,
2282
+ "loss": 0.9125,
2283
+ "step": 319
2284
+ },
2285
+ {
2286
+ "epoch": 0.6153846153846154,
2287
+ "grad_norm": 0.7734375,
2288
+ "learning_rate": 8.303696165562141e-06,
2289
+ "loss": 0.9366,
2290
+ "step": 320
2291
+ },
2292
+ {
2293
+ "epoch": 0.6173076923076923,
2294
+ "grad_norm": 0.78515625,
2295
+ "learning_rate": 8.291745636911382e-06,
2296
+ "loss": 0.9808,
2297
+ "step": 321
2298
+ },
2299
+ {
2300
+ "epoch": 0.6192307692307693,
2301
+ "grad_norm": 0.74609375,
2302
+ "learning_rate": 8.279761826081096e-06,
2303
+ "loss": 0.9105,
2304
+ "step": 322
2305
+ },
2306
+ {
2307
+ "epoch": 0.6211538461538462,
2308
+ "grad_norm": 0.79296875,
2309
+ "learning_rate": 8.26774485423719e-06,
2310
+ "loss": 0.9444,
2311
+ "step": 323
2312
+ },
2313
+ {
2314
+ "epoch": 0.6230769230769231,
2315
+ "grad_norm": 0.79296875,
2316
+ "learning_rate": 8.255694842880854e-06,
2317
+ "loss": 0.8981,
2318
+ "step": 324
2319
+ },
2320
+ {
2321
+ "epoch": 0.625,
2322
+ "grad_norm": 0.76171875,
2323
+ "learning_rate": 8.243611913847337e-06,
2324
+ "loss": 0.8919,
2325
+ "step": 325
2326
+ },
2327
+ {
2328
+ "epoch": 0.625,
2329
+ "eval_loss": 0.9419646263122559,
2330
+ "eval_runtime": 34.6043,
2331
+ "eval_samples_per_second": 67.708,
2332
+ "eval_steps_per_second": 16.934,
2333
+ "step": 325
2334
+ },
2335
+ {
2336
+ "epoch": 0.6269230769230769,
2337
+ "grad_norm": 0.76171875,
2338
+ "learning_rate": 8.231496189304704e-06,
2339
+ "loss": 0.8434,
2340
+ "step": 326
2341
+ },
2342
+ {
2343
+ "epoch": 0.6288461538461538,
2344
+ "grad_norm": 0.8671875,
2345
+ "learning_rate": 8.21934779175262e-06,
2346
+ "loss": 0.912,
2347
+ "step": 327
2348
+ },
2349
+ {
2350
+ "epoch": 0.6307692307692307,
2351
+ "grad_norm": 0.7890625,
2352
+ "learning_rate": 8.207166844021093e-06,
2353
+ "loss": 0.9085,
2354
+ "step": 328
2355
+ },
2356
+ {
2357
+ "epoch": 0.6326923076923077,
2358
+ "grad_norm": 0.77734375,
2359
+ "learning_rate": 8.19495346926924e-06,
2360
+ "loss": 0.9301,
2361
+ "step": 329
2362
+ },
2363
+ {
2364
+ "epoch": 0.6346153846153846,
2365
+ "grad_norm": 0.75,
2366
+ "learning_rate": 8.182707790984043e-06,
2367
+ "loss": 0.9023,
2368
+ "step": 330
2369
+ },
2370
+ {
2371
+ "epoch": 0.6365384615384615,
2372
+ "grad_norm": 0.76953125,
2373
+ "learning_rate": 8.170429932979097e-06,
2374
+ "loss": 0.9363,
2375
+ "step": 331
2376
+ },
2377
+ {
2378
+ "epoch": 0.6384615384615384,
2379
+ "grad_norm": 0.76953125,
2380
+ "learning_rate": 8.15812001939336e-06,
2381
+ "loss": 0.927,
2382
+ "step": 332
2383
+ },
2384
+ {
2385
+ "epoch": 0.6403846153846153,
2386
+ "grad_norm": 0.8671875,
2387
+ "learning_rate": 8.145778174689897e-06,
2388
+ "loss": 0.9826,
2389
+ "step": 333
2390
+ },
2391
+ {
2392
+ "epoch": 0.6423076923076924,
2393
+ "grad_norm": 0.8125,
2394
+ "learning_rate": 8.133404523654626e-06,
2395
+ "loss": 0.922,
2396
+ "step": 334
2397
+ },
2398
+ {
2399
+ "epoch": 0.6442307692307693,
2400
+ "grad_norm": 0.7578125,
2401
+ "learning_rate": 8.120999191395048e-06,
2402
+ "loss": 0.9405,
2403
+ "step": 335
2404
+ },
2405
+ {
2406
+ "epoch": 0.6461538461538462,
2407
+ "grad_norm": 0.78515625,
2408
+ "learning_rate": 8.108562303338987e-06,
2409
+ "loss": 0.8947,
2410
+ "step": 336
2411
+ },
2412
+ {
2413
+ "epoch": 0.6480769230769231,
2414
+ "grad_norm": 0.74609375,
2415
+ "learning_rate": 8.096093985233323e-06,
2416
+ "loss": 0.9109,
2417
+ "step": 337
2418
+ },
2419
+ {
2420
+ "epoch": 0.65,
2421
+ "grad_norm": 0.8125,
2422
+ "learning_rate": 8.083594363142717e-06,
2423
+ "loss": 0.9111,
2424
+ "step": 338
2425
+ },
2426
+ {
2427
+ "epoch": 0.6519230769230769,
2428
+ "grad_norm": 0.78515625,
2429
+ "learning_rate": 8.071063563448341e-06,
2430
+ "loss": 0.8957,
2431
+ "step": 339
2432
+ },
2433
+ {
2434
+ "epoch": 0.6538461538461539,
2435
+ "grad_norm": 0.74609375,
2436
+ "learning_rate": 8.058501712846594e-06,
2437
+ "loss": 0.9003,
2438
+ "step": 340
2439
+ },
2440
+ {
2441
+ "epoch": 0.6557692307692308,
2442
+ "grad_norm": 0.7890625,
2443
+ "learning_rate": 8.045908938347828e-06,
2444
+ "loss": 0.9372,
2445
+ "step": 341
2446
+ },
2447
+ {
2448
+ "epoch": 0.6576923076923077,
2449
+ "grad_norm": 0.75390625,
2450
+ "learning_rate": 8.03328536727506e-06,
2451
+ "loss": 0.8854,
2452
+ "step": 342
2453
+ },
2454
+ {
2455
+ "epoch": 0.6596153846153846,
2456
+ "grad_norm": 0.77734375,
2457
+ "learning_rate": 8.020631127262681e-06,
2458
+ "loss": 0.9505,
2459
+ "step": 343
2460
+ },
2461
+ {
2462
+ "epoch": 0.6615384615384615,
2463
+ "grad_norm": 0.765625,
2464
+ "learning_rate": 8.007946346255176e-06,
2465
+ "loss": 0.9581,
2466
+ "step": 344
2467
+ },
2468
+ {
2469
+ "epoch": 0.6634615384615384,
2470
+ "grad_norm": 0.83984375,
2471
+ "learning_rate": 7.995231152505815e-06,
2472
+ "loss": 0.9068,
2473
+ "step": 345
2474
+ },
2475
+ {
2476
+ "epoch": 0.6653846153846154,
2477
+ "grad_norm": 0.78125,
2478
+ "learning_rate": 7.982485674575373e-06,
2479
+ "loss": 0.9159,
2480
+ "step": 346
2481
+ },
2482
+ {
2483
+ "epoch": 0.6673076923076923,
2484
+ "grad_norm": 0.765625,
2485
+ "learning_rate": 7.96971004133082e-06,
2486
+ "loss": 0.9015,
2487
+ "step": 347
2488
+ },
2489
+ {
2490
+ "epoch": 0.6692307692307692,
2491
+ "grad_norm": 0.796875,
2492
+ "learning_rate": 7.95690438194402e-06,
2493
+ "loss": 0.9531,
2494
+ "step": 348
2495
+ },
2496
+ {
2497
+ "epoch": 0.6711538461538461,
2498
+ "grad_norm": 0.8046875,
2499
+ "learning_rate": 7.944068825890424e-06,
2500
+ "loss": 0.8971,
2501
+ "step": 349
2502
+ },
2503
+ {
2504
+ "epoch": 0.6730769230769231,
2505
+ "grad_norm": 0.83203125,
2506
+ "learning_rate": 7.931203502947762e-06,
2507
+ "loss": 0.868,
2508
+ "step": 350
2509
+ },
2510
+ {
2511
+ "epoch": 0.675,
2512
+ "grad_norm": 0.734375,
2513
+ "learning_rate": 7.918308543194735e-06,
2514
+ "loss": 0.9151,
2515
+ "step": 351
2516
+ },
2517
+ {
2518
+ "epoch": 0.676923076923077,
2519
+ "grad_norm": 0.77734375,
2520
+ "learning_rate": 7.905384077009693e-06,
2521
+ "loss": 0.9949,
2522
+ "step": 352
2523
+ },
2524
+ {
2525
+ "epoch": 0.6788461538461539,
2526
+ "grad_norm": 0.77734375,
2527
+ "learning_rate": 7.892430235069317e-06,
2528
+ "loss": 0.9025,
2529
+ "step": 353
2530
+ },
2531
+ {
2532
+ "epoch": 0.6807692307692308,
2533
+ "grad_norm": 0.76171875,
2534
+ "learning_rate": 7.879447148347307e-06,
2535
+ "loss": 0.8969,
2536
+ "step": 354
2537
+ },
2538
+ {
2539
+ "epoch": 0.6826923076923077,
2540
+ "grad_norm": 0.8828125,
2541
+ "learning_rate": 7.866434948113046e-06,
2542
+ "loss": 0.9086,
2543
+ "step": 355
2544
+ },
2545
+ {
2546
+ "epoch": 0.6846153846153846,
2547
+ "grad_norm": 0.78125,
2548
+ "learning_rate": 7.853393765930279e-06,
2549
+ "loss": 0.865,
2550
+ "step": 356
2551
+ },
2552
+ {
2553
+ "epoch": 0.6865384615384615,
2554
+ "grad_norm": 0.78515625,
2555
+ "learning_rate": 7.84032373365578e-06,
2556
+ "loss": 0.9514,
2557
+ "step": 357
2558
+ },
2559
+ {
2560
+ "epoch": 0.6884615384615385,
2561
+ "grad_norm": 0.7421875,
2562
+ "learning_rate": 7.827224983438024e-06,
2563
+ "loss": 0.8866,
2564
+ "step": 358
2565
+ },
2566
+ {
2567
+ "epoch": 0.6903846153846154,
2568
+ "grad_norm": 0.8671875,
2569
+ "learning_rate": 7.814097647715848e-06,
2570
+ "loss": 0.8856,
2571
+ "step": 359
2572
+ },
2573
+ {
2574
+ "epoch": 0.6923076923076923,
2575
+ "grad_norm": 0.78125,
2576
+ "learning_rate": 7.800941859217103e-06,
2577
+ "loss": 0.8864,
2578
+ "step": 360
2579
+ },
2580
+ {
2581
+ "epoch": 0.6942307692307692,
2582
+ "grad_norm": 0.75390625,
2583
+ "learning_rate": 7.787757750957335e-06,
2584
+ "loss": 0.9212,
2585
+ "step": 361
2586
+ },
2587
+ {
2588
+ "epoch": 0.6961538461538461,
2589
+ "grad_norm": 0.796875,
2590
+ "learning_rate": 7.77454545623841e-06,
2591
+ "loss": 0.9006,
2592
+ "step": 362
2593
+ },
2594
+ {
2595
+ "epoch": 0.698076923076923,
2596
+ "grad_norm": 0.7578125,
2597
+ "learning_rate": 7.761305108647188e-06,
2598
+ "loss": 0.9427,
2599
+ "step": 363
2600
+ },
2601
+ {
2602
+ "epoch": 0.7,
2603
+ "grad_norm": 0.78125,
2604
+ "learning_rate": 7.74803684205417e-06,
2605
+ "loss": 0.9583,
2606
+ "step": 364
2607
+ },
2608
+ {
2609
+ "epoch": 0.7019230769230769,
2610
+ "grad_norm": 0.80078125,
2611
+ "learning_rate": 7.734740790612137e-06,
2612
+ "loss": 0.9303,
2613
+ "step": 365
2614
+ },
2615
+ {
2616
+ "epoch": 0.7038461538461539,
2617
+ "grad_norm": 0.796875,
2618
+ "learning_rate": 7.72141708875479e-06,
2619
+ "loss": 0.9017,
2620
+ "step": 366
2621
+ },
2622
+ {
2623
+ "epoch": 0.7057692307692308,
2624
+ "grad_norm": 0.7421875,
2625
+ "learning_rate": 7.708065871195413e-06,
2626
+ "loss": 0.9247,
2627
+ "step": 367
2628
+ },
2629
+ {
2630
+ "epoch": 0.7076923076923077,
2631
+ "grad_norm": 0.7109375,
2632
+ "learning_rate": 7.694687272925487e-06,
2633
+ "loss": 0.8598,
2634
+ "step": 368
2635
+ },
2636
+ {
2637
+ "epoch": 0.7096153846153846,
2638
+ "grad_norm": 0.73046875,
2639
+ "learning_rate": 7.681281429213328e-06,
2640
+ "loss": 0.9719,
2641
+ "step": 369
2642
+ },
2643
+ {
2644
+ "epoch": 0.7115384615384616,
2645
+ "grad_norm": 0.76171875,
2646
+ "learning_rate": 7.667848475602735e-06,
2647
+ "loss": 0.9588,
2648
+ "step": 370
2649
+ },
2650
+ {
2651
+ "epoch": 0.7134615384615385,
2652
+ "grad_norm": 0.73046875,
2653
+ "learning_rate": 7.654388547911605e-06,
2654
+ "loss": 0.8185,
2655
+ "step": 371
2656
+ },
2657
+ {
2658
+ "epoch": 0.7153846153846154,
2659
+ "grad_norm": 0.73046875,
2660
+ "learning_rate": 7.640901782230567e-06,
2661
+ "loss": 0.93,
2662
+ "step": 372
2663
+ },
2664
+ {
2665
+ "epoch": 0.7173076923076923,
2666
+ "grad_norm": 0.734375,
2667
+ "learning_rate": 7.627388314921602e-06,
2668
+ "loss": 0.9846,
2669
+ "step": 373
2670
+ },
2671
+ {
2672
+ "epoch": 0.7192307692307692,
2673
+ "grad_norm": 0.74609375,
2674
+ "learning_rate": 7.613848282616665e-06,
2675
+ "loss": 0.9807,
2676
+ "step": 374
2677
+ },
2678
+ {
2679
+ "epoch": 0.7211538461538461,
2680
+ "grad_norm": 0.75390625,
2681
+ "learning_rate": 7.600281822216307e-06,
2682
+ "loss": 0.9011,
2683
+ "step": 375
2684
+ },
2685
+ {
2686
+ "epoch": 0.7230769230769231,
2687
+ "grad_norm": 0.74609375,
2688
+ "learning_rate": 7.586689070888284e-06,
2689
+ "loss": 0.8961,
2690
+ "step": 376
2691
+ },
2692
+ {
2693
+ "epoch": 0.725,
2694
+ "grad_norm": 0.7890625,
2695
+ "learning_rate": 7.5730701660661795e-06,
2696
+ "loss": 0.9279,
2697
+ "step": 377
2698
+ },
2699
+ {
2700
+ "epoch": 0.7269230769230769,
2701
+ "grad_norm": 0.74609375,
2702
+ "learning_rate": 7.559425245448006e-06,
2703
+ "loss": 0.9177,
2704
+ "step": 378
2705
+ },
2706
+ {
2707
+ "epoch": 0.7288461538461538,
2708
+ "grad_norm": 0.734375,
2709
+ "learning_rate": 7.5457544469948164e-06,
2710
+ "loss": 0.9309,
2711
+ "step": 379
2712
+ },
2713
+ {
2714
+ "epoch": 0.7307692307692307,
2715
+ "grad_norm": 0.75,
2716
+ "learning_rate": 7.532057908929311e-06,
2717
+ "loss": 0.8937,
2718
+ "step": 380
2719
+ },
2720
+ {
2721
+ "epoch": 0.7326923076923076,
2722
+ "grad_norm": 0.78515625,
2723
+ "learning_rate": 7.5183357697344395e-06,
2724
+ "loss": 0.895,
2725
+ "step": 381
2726
+ },
2727
+ {
2728
+ "epoch": 0.7346153846153847,
2729
+ "grad_norm": 0.7421875,
2730
+ "learning_rate": 7.504588168151994e-06,
2731
+ "loss": 0.9167,
2732
+ "step": 382
2733
+ },
2734
+ {
2735
+ "epoch": 0.7365384615384616,
2736
+ "grad_norm": 0.7421875,
2737
+ "learning_rate": 7.4908152431812175e-06,
2738
+ "loss": 0.921,
2739
+ "step": 383
2740
+ },
2741
+ {
2742
+ "epoch": 0.7384615384615385,
2743
+ "grad_norm": 0.75390625,
2744
+ "learning_rate": 7.477017134077389e-06,
2745
+ "loss": 0.8987,
2746
+ "step": 384
2747
+ },
2748
+ {
2749
+ "epoch": 0.7403846153846154,
2750
+ "grad_norm": 0.7578125,
2751
+ "learning_rate": 7.4631939803504215e-06,
2752
+ "loss": 0.8866,
2753
+ "step": 385
2754
+ },
2755
+ {
2756
+ "epoch": 0.7423076923076923,
2757
+ "grad_norm": 0.73828125,
2758
+ "learning_rate": 7.449345921763449e-06,
2759
+ "loss": 0.8745,
2760
+ "step": 386
2761
+ },
2762
+ {
2763
+ "epoch": 0.7442307692307693,
2764
+ "grad_norm": 0.75390625,
2765
+ "learning_rate": 7.435473098331411e-06,
2766
+ "loss": 0.865,
2767
+ "step": 387
2768
+ },
2769
+ {
2770
+ "epoch": 0.7461538461538462,
2771
+ "grad_norm": 0.78125,
2772
+ "learning_rate": 7.421575650319641e-06,
2773
+ "loss": 0.8841,
2774
+ "step": 388
2775
+ },
2776
+ {
2777
+ "epoch": 0.7480769230769231,
2778
+ "grad_norm": 0.765625,
2779
+ "learning_rate": 7.407653718242449e-06,
2780
+ "loss": 0.9637,
2781
+ "step": 389
2782
+ },
2783
+ {
2784
+ "epoch": 0.75,
2785
+ "grad_norm": 0.70703125,
2786
+ "learning_rate": 7.393707442861693e-06,
2787
+ "loss": 0.914,
2788
+ "step": 390
2789
+ },
2790
+ {
2791
+ "epoch": 0.75,
2792
+ "eval_loss": 0.9376137256622314,
2793
+ "eval_runtime": 34.5412,
2794
+ "eval_samples_per_second": 67.832,
2795
+ "eval_steps_per_second": 16.965,
2796
+ "step": 390
2797
+ },
2798
+ {
2799
+ "epoch": 0.7519230769230769,
2800
+ "grad_norm": 0.74609375,
2801
+ "learning_rate": 7.379736965185369e-06,
2802
+ "loss": 0.9394,
2803
+ "step": 391
2804
+ },
2805
+ {
2806
+ "epoch": 0.7538461538461538,
2807
+ "grad_norm": 0.77734375,
2808
+ "learning_rate": 7.365742426466169e-06,
2809
+ "loss": 0.9122,
2810
+ "step": 392
2811
+ },
2812
+ {
2813
+ "epoch": 0.7557692307692307,
2814
+ "grad_norm": 0.73046875,
2815
+ "learning_rate": 7.3517239682000675e-06,
2816
+ "loss": 0.9033,
2817
+ "step": 393
2818
+ },
2819
+ {
2820
+ "epoch": 0.7576923076923077,
2821
+ "grad_norm": 0.97265625,
2822
+ "learning_rate": 7.337681732124882e-06,
2823
+ "loss": 0.8908,
2824
+ "step": 394
2825
+ },
2826
+ {
2827
+ "epoch": 0.7596153846153846,
2828
+ "grad_norm": 0.734375,
2829
+ "learning_rate": 7.323615860218844e-06,
2830
+ "loss": 0.8938,
2831
+ "step": 395
2832
+ },
2833
+ {
2834
+ "epoch": 0.7615384615384615,
2835
+ "grad_norm": 0.75390625,
2836
+ "learning_rate": 7.30952649469916e-06,
2837
+ "loss": 0.9013,
2838
+ "step": 396
2839
+ },
2840
+ {
2841
+ "epoch": 0.7634615384615384,
2842
+ "grad_norm": 0.75390625,
2843
+ "learning_rate": 7.295413778020579e-06,
2844
+ "loss": 0.9203,
2845
+ "step": 397
2846
+ },
2847
+ {
2848
+ "epoch": 0.7653846153846153,
2849
+ "grad_norm": 0.76171875,
2850
+ "learning_rate": 7.281277852873947e-06,
2851
+ "loss": 0.9713,
2852
+ "step": 398
2853
+ },
2854
+ {
2855
+ "epoch": 0.7673076923076924,
2856
+ "grad_norm": 0.7890625,
2857
+ "learning_rate": 7.267118862184767e-06,
2858
+ "loss": 0.9376,
2859
+ "step": 399
2860
+ },
2861
+ {
2862
+ "epoch": 0.7692307692307693,
2863
+ "grad_norm": 0.765625,
2864
+ "learning_rate": 7.252936949111749e-06,
2865
+ "loss": 0.9329,
2866
+ "step": 400
2867
+ },
2868
+ {
2869
+ "epoch": 0.7711538461538462,
2870
+ "grad_norm": 0.734375,
2871
+ "learning_rate": 7.2387322570453724e-06,
2872
+ "loss": 0.8421,
2873
+ "step": 401
2874
+ },
2875
+ {
2876
+ "epoch": 0.7730769230769231,
2877
+ "grad_norm": 0.75,
2878
+ "learning_rate": 7.224504929606429e-06,
2879
+ "loss": 0.8929,
2880
+ "step": 402
2881
+ },
2882
+ {
2883
+ "epoch": 0.775,
2884
+ "grad_norm": 0.75390625,
2885
+ "learning_rate": 7.210255110644569e-06,
2886
+ "loss": 0.9063,
2887
+ "step": 403
2888
+ },
2889
+ {
2890
+ "epoch": 0.7769230769230769,
2891
+ "grad_norm": 0.7265625,
2892
+ "learning_rate": 7.195982944236853e-06,
2893
+ "loss": 0.9735,
2894
+ "step": 404
2895
+ },
2896
+ {
2897
+ "epoch": 0.7788461538461539,
2898
+ "grad_norm": 0.7109375,
2899
+ "learning_rate": 7.181688574686292e-06,
2900
+ "loss": 0.8794,
2901
+ "step": 405
2902
+ },
2903
+ {
2904
+ "epoch": 0.7807692307692308,
2905
+ "grad_norm": 0.75,
2906
+ "learning_rate": 7.167372146520386e-06,
2907
+ "loss": 0.8891,
2908
+ "step": 406
2909
+ },
2910
+ {
2911
+ "epoch": 0.7826923076923077,
2912
+ "grad_norm": 0.71484375,
2913
+ "learning_rate": 7.15303380448967e-06,
2914
+ "loss": 0.8951,
2915
+ "step": 407
2916
+ },
2917
+ {
2918
+ "epoch": 0.7846153846153846,
2919
+ "grad_norm": 0.78125,
2920
+ "learning_rate": 7.138673693566241e-06,
2921
+ "loss": 0.897,
2922
+ "step": 408
2923
+ },
2924
+ {
2925
+ "epoch": 0.7865384615384615,
2926
+ "grad_norm": 0.796875,
2927
+ "learning_rate": 7.1242919589422974e-06,
2928
+ "loss": 0.9431,
2929
+ "step": 409
2930
+ },
2931
+ {
2932
+ "epoch": 0.7884615384615384,
2933
+ "grad_norm": 0.74609375,
2934
+ "learning_rate": 7.1098887460286745e-06,
2935
+ "loss": 0.8704,
2936
+ "step": 410
2937
+ },
2938
+ {
2939
+ "epoch": 0.7903846153846154,
2940
+ "grad_norm": 0.765625,
2941
+ "learning_rate": 7.095464200453366e-06,
2942
+ "loss": 0.9813,
2943
+ "step": 411
2944
+ },
2945
+ {
2946
+ "epoch": 0.7923076923076923,
2947
+ "grad_norm": 0.734375,
2948
+ "learning_rate": 7.081018468060057e-06,
2949
+ "loss": 0.8657,
2950
+ "step": 412
2951
+ },
2952
+ {
2953
+ "epoch": 0.7942307692307692,
2954
+ "grad_norm": 0.7578125,
2955
+ "learning_rate": 7.066551694906651e-06,
2956
+ "loss": 0.9216,
2957
+ "step": 413
2958
+ },
2959
+ {
2960
+ "epoch": 0.7961538461538461,
2961
+ "grad_norm": 0.734375,
2962
+ "learning_rate": 7.052064027263785e-06,
2963
+ "loss": 0.9203,
2964
+ "step": 414
2965
+ },
2966
+ {
2967
+ "epoch": 0.7980769230769231,
2968
+ "grad_norm": 0.73828125,
2969
+ "learning_rate": 7.0375556116133605e-06,
2970
+ "loss": 0.9002,
2971
+ "step": 415
2972
+ },
2973
+ {
2974
+ "epoch": 0.8,
2975
+ "grad_norm": 0.73046875,
2976
+ "learning_rate": 7.023026594647057e-06,
2977
+ "loss": 0.9279,
2978
+ "step": 416
2979
+ },
2980
+ {
2981
+ "epoch": 0.801923076923077,
2982
+ "grad_norm": 0.71875,
2983
+ "learning_rate": 7.008477123264849e-06,
2984
+ "loss": 0.8851,
2985
+ "step": 417
2986
+ },
2987
+ {
2988
+ "epoch": 0.8038461538461539,
2989
+ "grad_norm": 0.72265625,
2990
+ "learning_rate": 6.9939073445735205e-06,
2991
+ "loss": 0.8718,
2992
+ "step": 418
2993
+ },
2994
+ {
2995
+ "epoch": 0.8057692307692308,
2996
+ "grad_norm": 0.703125,
2997
+ "learning_rate": 6.9793174058851805e-06,
2998
+ "loss": 0.8874,
2999
+ "step": 419
3000
+ },
3001
+ {
3002
+ "epoch": 0.8076923076923077,
3003
+ "grad_norm": 0.73828125,
3004
+ "learning_rate": 6.964707454715772e-06,
3005
+ "loss": 0.8747,
3006
+ "step": 420
3007
+ },
3008
+ {
3009
+ "epoch": 0.8096153846153846,
3010
+ "grad_norm": 0.7578125,
3011
+ "learning_rate": 6.9500776387835785e-06,
3012
+ "loss": 0.9146,
3013
+ "step": 421
3014
+ },
3015
+ {
3016
+ "epoch": 0.8115384615384615,
3017
+ "grad_norm": 0.78515625,
3018
+ "learning_rate": 6.935428106007734e-06,
3019
+ "loss": 0.9598,
3020
+ "step": 422
3021
+ },
3022
+ {
3023
+ "epoch": 0.8134615384615385,
3024
+ "grad_norm": 0.75390625,
3025
+ "learning_rate": 6.920759004506723e-06,
3026
+ "loss": 0.873,
3027
+ "step": 423
3028
+ },
3029
+ {
3030
+ "epoch": 0.8153846153846154,
3031
+ "grad_norm": 0.80859375,
3032
+ "learning_rate": 6.906070482596887e-06,
3033
+ "loss": 0.9395,
3034
+ "step": 424
3035
+ },
3036
+ {
3037
+ "epoch": 0.8173076923076923,
3038
+ "grad_norm": 0.71484375,
3039
+ "learning_rate": 6.891362688790925e-06,
3040
+ "loss": 0.8713,
3041
+ "step": 425
3042
+ },
3043
+ {
3044
+ "epoch": 0.8192307692307692,
3045
+ "grad_norm": 0.7109375,
3046
+ "learning_rate": 6.876635771796386e-06,
3047
+ "loss": 0.8427,
3048
+ "step": 426
3049
+ },
3050
+ {
3051
+ "epoch": 0.8211538461538461,
3052
+ "grad_norm": 0.75,
3053
+ "learning_rate": 6.8618898805141744e-06,
3054
+ "loss": 0.9148,
3055
+ "step": 427
3056
+ },
3057
+ {
3058
+ "epoch": 0.823076923076923,
3059
+ "grad_norm": 0.74609375,
3060
+ "learning_rate": 6.847125164037036e-06,
3061
+ "loss": 0.8788,
3062
+ "step": 428
3063
+ },
3064
+ {
3065
+ "epoch": 0.825,
3066
+ "grad_norm": 0.72265625,
3067
+ "learning_rate": 6.832341771648057e-06,
3068
+ "loss": 0.8523,
3069
+ "step": 429
3070
+ },
3071
+ {
3072
+ "epoch": 0.8269230769230769,
3073
+ "grad_norm": 0.7265625,
3074
+ "learning_rate": 6.817539852819149e-06,
3075
+ "loss": 0.869,
3076
+ "step": 430
3077
+ },
3078
+ {
3079
+ "epoch": 0.8288461538461539,
3080
+ "grad_norm": 0.6953125,
3081
+ "learning_rate": 6.802719557209547e-06,
3082
+ "loss": 0.8934,
3083
+ "step": 431
3084
+ },
3085
+ {
3086
+ "epoch": 0.8307692307692308,
3087
+ "grad_norm": 0.7265625,
3088
+ "learning_rate": 6.787881034664283e-06,
3089
+ "loss": 0.9127,
3090
+ "step": 432
3091
+ },
3092
+ {
3093
+ "epoch": 0.8326923076923077,
3094
+ "grad_norm": 0.7421875,
3095
+ "learning_rate": 6.773024435212678e-06,
3096
+ "loss": 0.9617,
3097
+ "step": 433
3098
+ },
3099
+ {
3100
+ "epoch": 0.8346153846153846,
3101
+ "grad_norm": 0.7421875,
3102
+ "learning_rate": 6.758149909066832e-06,
3103
+ "loss": 0.8918,
3104
+ "step": 434
3105
+ },
3106
+ {
3107
+ "epoch": 0.8365384615384616,
3108
+ "grad_norm": 0.7734375,
3109
+ "learning_rate": 6.743257606620094e-06,
3110
+ "loss": 0.9721,
3111
+ "step": 435
3112
+ },
3113
+ {
3114
+ "epoch": 0.8384615384615385,
3115
+ "grad_norm": 0.734375,
3116
+ "learning_rate": 6.728347678445539e-06,
3117
+ "loss": 0.9183,
3118
+ "step": 436
3119
+ },
3120
+ {
3121
+ "epoch": 0.8403846153846154,
3122
+ "grad_norm": 0.7265625,
3123
+ "learning_rate": 6.713420275294467e-06,
3124
+ "loss": 0.8995,
3125
+ "step": 437
3126
+ },
3127
+ {
3128
+ "epoch": 0.8423076923076923,
3129
+ "grad_norm": 0.75,
3130
+ "learning_rate": 6.69847554809485e-06,
3131
+ "loss": 0.879,
3132
+ "step": 438
3133
+ },
3134
+ {
3135
+ "epoch": 0.8442307692307692,
3136
+ "grad_norm": 0.7421875,
3137
+ "learning_rate": 6.683513647949826e-06,
3138
+ "loss": 0.927,
3139
+ "step": 439
3140
+ },
3141
+ {
3142
+ "epoch": 0.8461538461538461,
3143
+ "grad_norm": 0.75390625,
3144
+ "learning_rate": 6.668534726136166e-06,
3145
+ "loss": 0.9,
3146
+ "step": 440
3147
+ },
3148
+ {
3149
+ "epoch": 0.8480769230769231,
3150
+ "grad_norm": 0.74609375,
3151
+ "learning_rate": 6.653538934102743e-06,
3152
+ "loss": 0.8526,
3153
+ "step": 441
3154
+ },
3155
+ {
3156
+ "epoch": 0.85,
3157
+ "grad_norm": 0.75,
3158
+ "learning_rate": 6.638526423468999e-06,
3159
+ "loss": 0.8354,
3160
+ "step": 442
3161
+ },
3162
+ {
3163
+ "epoch": 0.8519230769230769,
3164
+ "grad_norm": 0.75,
3165
+ "learning_rate": 6.6234973460234184e-06,
3166
+ "loss": 0.8852,
3167
+ "step": 443
3168
+ },
3169
+ {
3170
+ "epoch": 0.8538461538461538,
3171
+ "grad_norm": 0.75,
3172
+ "learning_rate": 6.608451853721985e-06,
3173
+ "loss": 0.9275,
3174
+ "step": 444
3175
+ },
3176
+ {
3177
+ "epoch": 0.8557692307692307,
3178
+ "grad_norm": 0.72265625,
3179
+ "learning_rate": 6.593390098686653e-06,
3180
+ "loss": 0.9023,
3181
+ "step": 445
3182
+ },
3183
+ {
3184
+ "epoch": 0.8576923076923076,
3185
+ "grad_norm": 0.75,
3186
+ "learning_rate": 6.578312233203804e-06,
3187
+ "loss": 0.8804,
3188
+ "step": 446
3189
+ },
3190
+ {
3191
+ "epoch": 0.8596153846153847,
3192
+ "grad_norm": 0.7109375,
3193
+ "learning_rate": 6.563218409722712e-06,
3194
+ "loss": 0.9229,
3195
+ "step": 447
3196
+ },
3197
+ {
3198
+ "epoch": 0.8615384615384616,
3199
+ "grad_norm": 0.71484375,
3200
+ "learning_rate": 6.548108780853995e-06,
3201
+ "loss": 0.8995,
3202
+ "step": 448
3203
+ },
3204
+ {
3205
+ "epoch": 0.8634615384615385,
3206
+ "grad_norm": 0.703125,
3207
+ "learning_rate": 6.532983499368078e-06,
3208
+ "loss": 0.8847,
3209
+ "step": 449
3210
+ },
3211
+ {
3212
+ "epoch": 0.8653846153846154,
3213
+ "grad_norm": 0.71484375,
3214
+ "learning_rate": 6.5178427181936485e-06,
3215
+ "loss": 0.923,
3216
+ "step": 450
3217
+ },
3218
+ {
3219
+ "epoch": 0.8673076923076923,
3220
+ "grad_norm": 0.73828125,
3221
+ "learning_rate": 6.502686590416105e-06,
3222
+ "loss": 0.8987,
3223
+ "step": 451
3224
+ },
3225
+ {
3226
+ "epoch": 0.8692307692307693,
3227
+ "grad_norm": 0.7109375,
3228
+ "learning_rate": 6.487515269276015e-06,
3229
+ "loss": 0.9345,
3230
+ "step": 452
3231
+ },
3232
+ {
3233
+ "epoch": 0.8711538461538462,
3234
+ "grad_norm": 0.74609375,
3235
+ "learning_rate": 6.472328908167562e-06,
3236
+ "loss": 0.8575,
3237
+ "step": 453
3238
+ },
3239
+ {
3240
+ "epoch": 0.8730769230769231,
3241
+ "grad_norm": 0.734375,
3242
+ "learning_rate": 6.457127660636994e-06,
3243
+ "loss": 0.9209,
3244
+ "step": 454
3245
+ },
3246
+ {
3247
+ "epoch": 0.875,
3248
+ "grad_norm": 0.74609375,
3249
+ "learning_rate": 6.441911680381074e-06,
3250
+ "loss": 0.8873,
3251
+ "step": 455
3252
+ },
3253
+ {
3254
+ "epoch": 0.875,
3255
+ "eval_loss": 0.9346491098403931,
3256
+ "eval_runtime": 34.5947,
3257
+ "eval_samples_per_second": 67.727,
3258
+ "eval_steps_per_second": 16.939,
3259
+ "step": 455
3260
+ },
3261
+ {
3262
+ "epoch": 0.8769230769230769,
3263
+ "grad_norm": 0.73046875,
3264
+ "learning_rate": 6.426681121245527e-06,
3265
+ "loss": 0.9187,
3266
+ "step": 456
3267
+ },
3268
+ {
3269
+ "epoch": 0.8788461538461538,
3270
+ "grad_norm": 0.77734375,
3271
+ "learning_rate": 6.411436137223479e-06,
3272
+ "loss": 0.9509,
3273
+ "step": 457
3274
+ },
3275
+ {
3276
+ "epoch": 0.8807692307692307,
3277
+ "grad_norm": 0.75,
3278
+ "learning_rate": 6.396176882453902e-06,
3279
+ "loss": 0.9401,
3280
+ "step": 458
3281
+ },
3282
+ {
3283
+ "epoch": 0.8826923076923077,
3284
+ "grad_norm": 0.73828125,
3285
+ "learning_rate": 6.38090351122006e-06,
3286
+ "loss": 0.8767,
3287
+ "step": 459
3288
+ },
3289
+ {
3290
+ "epoch": 0.8846153846153846,
3291
+ "grad_norm": 0.72265625,
3292
+ "learning_rate": 6.365616177947945e-06,
3293
+ "loss": 0.8637,
3294
+ "step": 460
3295
+ },
3296
+ {
3297
+ "epoch": 0.8865384615384615,
3298
+ "grad_norm": 0.74609375,
3299
+ "learning_rate": 6.350315037204714e-06,
3300
+ "loss": 0.9081,
3301
+ "step": 461
3302
+ },
3303
+ {
3304
+ "epoch": 0.8884615384615384,
3305
+ "grad_norm": 0.73828125,
3306
+ "learning_rate": 6.335000243697134e-06,
3307
+ "loss": 0.9054,
3308
+ "step": 462
3309
+ },
3310
+ {
3311
+ "epoch": 0.8903846153846153,
3312
+ "grad_norm": 0.73046875,
3313
+ "learning_rate": 6.319671952270004e-06,
3314
+ "loss": 0.9045,
3315
+ "step": 463
3316
+ },
3317
+ {
3318
+ "epoch": 0.8923076923076924,
3319
+ "grad_norm": 0.71484375,
3320
+ "learning_rate": 6.304330317904605e-06,
3321
+ "loss": 0.9227,
3322
+ "step": 464
3323
+ },
3324
+ {
3325
+ "epoch": 0.8942307692307693,
3326
+ "grad_norm": 0.69921875,
3327
+ "learning_rate": 6.288975495717124e-06,
3328
+ "loss": 0.8882,
3329
+ "step": 465
3330
+ },
3331
+ {
3332
+ "epoch": 0.8961538461538462,
3333
+ "grad_norm": 0.76171875,
3334
+ "learning_rate": 6.273607640957085e-06,
3335
+ "loss": 0.9967,
3336
+ "step": 466
3337
+ },
3338
+ {
3339
+ "epoch": 0.8980769230769231,
3340
+ "grad_norm": 0.73046875,
3341
+ "learning_rate": 6.258226909005783e-06,
3342
+ "loss": 0.9474,
3343
+ "step": 467
3344
+ },
3345
+ {
3346
+ "epoch": 0.9,
3347
+ "grad_norm": 0.7578125,
3348
+ "learning_rate": 6.2428334553747135e-06,
3349
+ "loss": 0.912,
3350
+ "step": 468
3351
+ },
3352
+ {
3353
+ "epoch": 0.9019230769230769,
3354
+ "grad_norm": 0.74609375,
3355
+ "learning_rate": 6.227427435703997e-06,
3356
+ "loss": 0.9355,
3357
+ "step": 469
3358
+ },
3359
+ {
3360
+ "epoch": 0.9038461538461539,
3361
+ "grad_norm": 0.75,
3362
+ "learning_rate": 6.212009005760805e-06,
3363
+ "loss": 0.9328,
3364
+ "step": 470
3365
+ },
3366
+ {
3367
+ "epoch": 0.9057692307692308,
3368
+ "grad_norm": 0.74609375,
3369
+ "learning_rate": 6.1965783214377895e-06,
3370
+ "loss": 0.9323,
3371
+ "step": 471
3372
+ },
3373
+ {
3374
+ "epoch": 0.9076923076923077,
3375
+ "grad_norm": 0.71484375,
3376
+ "learning_rate": 6.181135538751504e-06,
3377
+ "loss": 0.8865,
3378
+ "step": 472
3379
+ },
3380
+ {
3381
+ "epoch": 0.9096153846153846,
3382
+ "grad_norm": 0.73046875,
3383
+ "learning_rate": 6.165680813840822e-06,
3384
+ "loss": 0.9123,
3385
+ "step": 473
3386
+ },
3387
+ {
3388
+ "epoch": 0.9115384615384615,
3389
+ "grad_norm": 0.73828125,
3390
+ "learning_rate": 6.150214302965368e-06,
3391
+ "loss": 0.9209,
3392
+ "step": 474
3393
+ },
3394
+ {
3395
+ "epoch": 0.9134615384615384,
3396
+ "grad_norm": 0.7109375,
3397
+ "learning_rate": 6.134736162503929e-06,
3398
+ "loss": 0.9377,
3399
+ "step": 475
3400
+ },
3401
+ {
3402
+ "epoch": 0.9153846153846154,
3403
+ "grad_norm": 0.734375,
3404
+ "learning_rate": 6.119246548952877e-06,
3405
+ "loss": 0.9317,
3406
+ "step": 476
3407
+ },
3408
+ {
3409
+ "epoch": 0.9173076923076923,
3410
+ "grad_norm": 0.71484375,
3411
+ "learning_rate": 6.103745618924587e-06,
3412
+ "loss": 0.8839,
3413
+ "step": 477
3414
+ },
3415
+ {
3416
+ "epoch": 0.9192307692307692,
3417
+ "grad_norm": 0.71484375,
3418
+ "learning_rate": 6.088233529145849e-06,
3419
+ "loss": 0.8823,
3420
+ "step": 478
3421
+ },
3422
+ {
3423
+ "epoch": 0.9211538461538461,
3424
+ "grad_norm": 0.7421875,
3425
+ "learning_rate": 6.072710436456293e-06,
3426
+ "loss": 0.9031,
3427
+ "step": 479
3428
+ },
3429
+ {
3430
+ "epoch": 0.9230769230769231,
3431
+ "grad_norm": 0.7109375,
3432
+ "learning_rate": 6.057176497806791e-06,
3433
+ "loss": 0.9132,
3434
+ "step": 480
3435
+ },
3436
+ {
3437
+ "epoch": 0.925,
3438
+ "grad_norm": 0.7734375,
3439
+ "learning_rate": 6.041631870257882e-06,
3440
+ "loss": 0.8772,
3441
+ "step": 481
3442
+ },
3443
+ {
3444
+ "epoch": 0.926923076923077,
3445
+ "grad_norm": 0.73046875,
3446
+ "learning_rate": 6.026076710978172e-06,
3447
+ "loss": 0.901,
3448
+ "step": 482
3449
+ },
3450
+ {
3451
+ "epoch": 0.9288461538461539,
3452
+ "grad_norm": 0.74609375,
3453
+ "learning_rate": 6.010511177242757e-06,
3454
+ "loss": 0.9196,
3455
+ "step": 483
3456
+ },
3457
+ {
3458
+ "epoch": 0.9307692307692308,
3459
+ "grad_norm": 0.73046875,
3460
+ "learning_rate": 5.994935426431627e-06,
3461
+ "loss": 0.9718,
3462
+ "step": 484
3463
+ },
3464
+ {
3465
+ "epoch": 0.9326923076923077,
3466
+ "grad_norm": 0.76953125,
3467
+ "learning_rate": 5.979349616028067e-06,
3468
+ "loss": 0.963,
3469
+ "step": 485
3470
+ },
3471
+ {
3472
+ "epoch": 0.9346153846153846,
3473
+ "grad_norm": 0.70703125,
3474
+ "learning_rate": 5.963753903617084e-06,
3475
+ "loss": 0.9048,
3476
+ "step": 486
3477
+ },
3478
+ {
3479
+ "epoch": 0.9365384615384615,
3480
+ "grad_norm": 0.7578125,
3481
+ "learning_rate": 5.948148446883794e-06,
3482
+ "loss": 0.9705,
3483
+ "step": 487
3484
+ },
3485
+ {
3486
+ "epoch": 0.9384615384615385,
3487
+ "grad_norm": 0.71484375,
3488
+ "learning_rate": 5.932533403611835e-06,
3489
+ "loss": 0.8878,
3490
+ "step": 488
3491
+ },
3492
+ {
3493
+ "epoch": 0.9403846153846154,
3494
+ "grad_norm": 0.73046875,
3495
+ "learning_rate": 5.916908931681781e-06,
3496
+ "loss": 0.9049,
3497
+ "step": 489
3498
+ },
3499
+ {
3500
+ "epoch": 0.9423076923076923,
3501
+ "grad_norm": 0.703125,
3502
+ "learning_rate": 5.90127518906953e-06,
3503
+ "loss": 0.8733,
3504
+ "step": 490
3505
+ },
3506
+ {
3507
+ "epoch": 0.9442307692307692,
3508
+ "grad_norm": 0.69921875,
3509
+ "learning_rate": 5.885632333844714e-06,
3510
+ "loss": 0.8896,
3511
+ "step": 491
3512
+ },
3513
+ {
3514
+ "epoch": 0.9461538461538461,
3515
+ "grad_norm": 0.7109375,
3516
+ "learning_rate": 5.8699805241691065e-06,
3517
+ "loss": 0.9191,
3518
+ "step": 492
3519
+ },
3520
+ {
3521
+ "epoch": 0.948076923076923,
3522
+ "grad_norm": 0.69921875,
3523
+ "learning_rate": 5.854319918295012e-06,
3524
+ "loss": 0.8721,
3525
+ "step": 493
3526
+ },
3527
+ {
3528
+ "epoch": 0.95,
3529
+ "grad_norm": 0.73828125,
3530
+ "learning_rate": 5.838650674563674e-06,
3531
+ "loss": 0.8746,
3532
+ "step": 494
3533
+ },
3534
+ {
3535
+ "epoch": 0.9519230769230769,
3536
+ "grad_norm": 0.74609375,
3537
+ "learning_rate": 5.82297295140367e-06,
3538
+ "loss": 0.9072,
3539
+ "step": 495
3540
+ },
3541
+ {
3542
+ "epoch": 0.9538461538461539,
3543
+ "grad_norm": 0.74609375,
3544
+ "learning_rate": 5.807286907329315e-06,
3545
+ "loss": 0.8981,
3546
+ "step": 496
3547
+ },
3548
+ {
3549
+ "epoch": 0.9557692307692308,
3550
+ "grad_norm": 0.734375,
3551
+ "learning_rate": 5.79159270093905e-06,
3552
+ "loss": 0.9358,
3553
+ "step": 497
3554
+ },
3555
+ {
3556
+ "epoch": 0.9576923076923077,
3557
+ "grad_norm": 0.7109375,
3558
+ "learning_rate": 5.7758904909138495e-06,
3559
+ "loss": 0.915,
3560
+ "step": 498
3561
+ },
3562
+ {
3563
+ "epoch": 0.9596153846153846,
3564
+ "grad_norm": 0.72265625,
3565
+ "learning_rate": 5.760180436015604e-06,
3566
+ "loss": 0.8652,
3567
+ "step": 499
3568
+ },
3569
+ {
3570
+ "epoch": 0.9615384615384616,
3571
+ "grad_norm": 0.7265625,
3572
+ "learning_rate": 5.74446269508553e-06,
3573
+ "loss": 0.8926,
3574
+ "step": 500
3575
+ },
3576
+ {
3577
+ "epoch": 0.9634615384615385,
3578
+ "grad_norm": 0.71484375,
3579
+ "learning_rate": 5.7287374270425475e-06,
3580
+ "loss": 0.8889,
3581
+ "step": 501
3582
+ },
3583
+ {
3584
+ "epoch": 0.9653846153846154,
3585
+ "grad_norm": 0.703125,
3586
+ "learning_rate": 5.7130047908816884e-06,
3587
+ "loss": 0.9027,
3588
+ "step": 502
3589
+ },
3590
+ {
3591
+ "epoch": 0.9673076923076923,
3592
+ "grad_norm": 0.7421875,
3593
+ "learning_rate": 5.69726494567248e-06,
3594
+ "loss": 0.9419,
3595
+ "step": 503
3596
+ },
3597
+ {
3598
+ "epoch": 0.9692307692307692,
3599
+ "grad_norm": 0.73046875,
3600
+ "learning_rate": 5.681518050557336e-06,
3601
+ "loss": 0.9396,
3602
+ "step": 504
3603
+ },
3604
+ {
3605
+ "epoch": 0.9711538461538461,
3606
+ "grad_norm": 0.71875,
3607
+ "learning_rate": 5.6657642647499545e-06,
3608
+ "loss": 0.8828,
3609
+ "step": 505
3610
+ },
3611
+ {
3612
+ "epoch": 0.9730769230769231,
3613
+ "grad_norm": 0.765625,
3614
+ "learning_rate": 5.650003747533701e-06,
3615
+ "loss": 0.944,
3616
+ "step": 506
3617
+ },
3618
+ {
3619
+ "epoch": 0.975,
3620
+ "grad_norm": 0.73046875,
3621
+ "learning_rate": 5.6342366582600035e-06,
3622
+ "loss": 0.9072,
3623
+ "step": 507
3624
+ },
3625
+ {
3626
+ "epoch": 0.9769230769230769,
3627
+ "grad_norm": 0.7265625,
3628
+ "learning_rate": 5.61846315634674e-06,
3629
+ "loss": 0.9015,
3630
+ "step": 508
3631
+ },
3632
+ {
3633
+ "epoch": 0.9788461538461538,
3634
+ "grad_norm": 0.7265625,
3635
+ "learning_rate": 5.6026834012766155e-06,
3636
+ "loss": 0.9117,
3637
+ "step": 509
3638
+ },
3639
+ {
3640
+ "epoch": 0.9807692307692307,
3641
+ "grad_norm": 0.75,
3642
+ "learning_rate": 5.586897552595573e-06,
3643
+ "loss": 0.971,
3644
+ "step": 510
3645
+ },
3646
+ {
3647
+ "epoch": 0.9826923076923076,
3648
+ "grad_norm": 0.73828125,
3649
+ "learning_rate": 5.571105769911159e-06,
3650
+ "loss": 0.8729,
3651
+ "step": 511
3652
+ },
3653
+ {
3654
+ "epoch": 0.9846153846153847,
3655
+ "grad_norm": 0.72265625,
3656
+ "learning_rate": 5.555308212890917e-06,
3657
+ "loss": 0.9132,
3658
+ "step": 512
3659
+ },
3660
+ {
3661
+ "epoch": 0.9865384615384616,
3662
+ "grad_norm": 0.72265625,
3663
+ "learning_rate": 5.539505041260779e-06,
3664
+ "loss": 0.8606,
3665
+ "step": 513
3666
+ },
3667
+ {
3668
+ "epoch": 0.9884615384615385,
3669
+ "grad_norm": 0.703125,
3670
+ "learning_rate": 5.523696414803438e-06,
3671
+ "loss": 0.8937,
3672
+ "step": 514
3673
+ },
3674
+ {
3675
+ "epoch": 0.9903846153846154,
3676
+ "grad_norm": 0.73046875,
3677
+ "learning_rate": 5.507882493356745e-06,
3678
+ "loss": 0.913,
3679
+ "step": 515
3680
+ },
3681
+ {
3682
+ "epoch": 0.9923076923076923,
3683
+ "grad_norm": 4.5625,
3684
+ "learning_rate": 5.49206343681209e-06,
3685
+ "loss": 0.862,
3686
+ "step": 516
3687
+ },
3688
+ {
3689
+ "epoch": 0.9942307692307693,
3690
+ "grad_norm": 0.703125,
3691
+ "learning_rate": 5.476239405112775e-06,
3692
+ "loss": 0.8662,
3693
+ "step": 517
3694
+ },
3695
+ {
3696
+ "epoch": 0.9961538461538462,
3697
+ "grad_norm": 0.73046875,
3698
+ "learning_rate": 5.460410558252408e-06,
3699
+ "loss": 0.8443,
3700
+ "step": 518
3701
+ },
3702
+ {
3703
+ "epoch": 0.9980769230769231,
3704
+ "grad_norm": 0.7578125,
3705
+ "learning_rate": 5.444577056273284e-06,
3706
+ "loss": 0.9569,
3707
+ "step": 519
3708
+ },
3709
+ {
3710
+ "epoch": 1.0,
3711
+ "grad_norm": 0.73828125,
3712
+ "learning_rate": 5.428739059264767e-06,
3713
+ "loss": 0.8854,
3714
+ "step": 520
3715
+ },
3716
+ {
3717
+ "epoch": 1.0,
3718
+ "eval_loss": 0.9325999617576599,
3719
+ "eval_runtime": 34.5297,
3720
+ "eval_samples_per_second": 67.855,
3721
+ "eval_steps_per_second": 16.971,
3722
+ "step": 520
3723
+ }
3724
+ ],
3725
+ "logging_steps": 1,
3726
+ "max_steps": 1040,
3727
+ "num_input_tokens_seen": 0,
3728
+ "num_train_epochs": 2,
3729
+ "save_steps": 520,
3730
+ "stateful_callbacks": {
3731
+ "TrainerControl": {
3732
+ "args": {
3733
+ "should_epoch_stop": false,
3734
+ "should_evaluate": false,
3735
+ "should_log": false,
3736
+ "should_save": true,
3737
+ "should_training_stop": false
3738
+ },
3739
+ "attributes": {}
3740
+ }
3741
+ },
3742
+ "total_flos": 2.971810321620009e+18,
3743
+ "train_batch_size": 2,
3744
+ "trial_name": null,
3745
+ "trial_params": null
3746
+ }
checkpoint-520/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b0615dd190e3cad06c1c45c1f3f829927e89a7869d28d503cafc3ceb762da2c
3
+ size 10808
checkpoint-520/vocab.json ADDED
The diff for this file is too large to render. See raw diff