minghaowu committed (verified)
Commit 756b61d · 1 Parent(s): f1d25be

Training in progress, step 300, checkpoint

last-checkpoint/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35376801879c11c2adf0342846cbdbd18018dfbefe5636ecddbb4986e6a4b9bc
+ size 7414895232
last-checkpoint/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22493093364c2d8775f9fa791603683a41894a7681fb268a415cbfdb9baa0802
+ size 7414897472
last-checkpoint/global_step300/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:869b023df6a1b3b6dd52e712b227e9f9b058a1fa0deb100e1ceb528e4c384879
+ size 2471673464
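The three new files under global_step300 (two bf16 ZeRO optimizer-state shards and the model states) are tracked through Git LFS, so the commit only adds small pointer files carrying a spec version, a sha256 oid, and the blob size in bytes. As a minimal sketch (not part of the commit), the fields can be read like this, assuming the working tree still holds the raw pointer text rather than the downloaded blob:

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        if not line.strip():
            continue
        key, _, value = line.partition(" ")
        fields[key] = value
    fields["size"] = int(fields["size"])  # blob size in bytes
    return fields

# Illustrative path from this commit; requires a local checkout.
ptr = parse_lfs_pointer(
    "last-checkpoint/global_step300/mp_rank_00_model_states.pt"
)
print(ptr["oid"], ptr["size"])  # sha256:869b0... 2471673464
```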
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step200
+ global_step300
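The one-line latest file is DeepSpeed's bookmark for the newest checkpoint tag; this commit moves it from global_step200 to global_step300. A small sketch of how a resume script might use it, with illustrative local paths:

```python
from pathlib import Path

checkpoint_root = Path("last-checkpoint")  # illustrative local path

# "latest" holds a single tag such as "global_step300".
tag = (checkpoint_root / "latest").read_text().strip()
step_dir = checkpoint_root / tag

# step_dir now points at the directory holding the ZeRO optimizer shards
# (bf16_zero_pp_rank_*_optim_states.pt) and mp_rank_00_model_states.pt.
print(f"Resuming from {step_dir}")
```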
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:39b72fde5d7858e6afce8db4b0d0fdeca00d87ecb6c744603b12d525c98ccf36
+ oid sha256:a333a1e7d62732487a539671fb04f1a32f072d93c99b9b0315f5d99543bf0954
  size 2996982344
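Since an LFS oid is the sha256 digest of the full object, the refreshed model.safetensors can be verified after download by hashing the local file and comparing it to the new oid above; a sketch assuming the roughly 3 GB blob has already been fetched:

```python
import hashlib
from pathlib import Path

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through sha256 so large checkpoints fit in memory."""
    digest = hashlib.sha256()
    with Path(path).open("rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "a333a1e7d62732487a539671fb04f1a32f072d93c99b9b0315f5d99543bf0954"
actual = sha256_of("last-checkpoint/model.safetensors")  # illustrative path
assert actual == expected, "model.safetensors does not match the LFS oid"
```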
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:66c5a0df19b9c4c7f9628533d87b5e767121c5b1c20697fb2cfc5c745c752a6a
+ oid sha256:ac7985cc855bc8c48442e153af46fa17bf79f585cbe447e55ec06b7117fe3b7a
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:59aedc175259cd5edebf90ddc9a5fcde025d3ccce2f0eca359a9ff56cba98147
+ oid sha256:c5db25d30873a15ccd0be3a615086c3e7e9acef4fdd0089808ea03170471e881
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e40935651363f2a1063f3f036a3600d22c7ab6431c4f31c42b100e6e12d0544e
+ oid sha256:26eca4504b1c2e2d8a795c4c11dfa892a9cb1f738443ece8d5cf3596820e3480
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
    "best_metric": null,
    "best_model_checkpoint": null,
-   "epoch": 1.4365186182144458,
+   "epoch": 2.155226558995065,
    "eval_steps": 500,
-   "global_step": 200,
+   "global_step": 300,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
@@ -147,6 +147,76 @@
      "learning_rate": 5.7866666666666674e-06,
      "loss": 0.4665,
      "step": 200
+   },
+   {
+     "epoch": 1.5082996859578286,
+     "grad_norm": 0.020146360620856285,
+     "learning_rate": 5.5200000000000005e-06,
+     "loss": 0.4627,
+     "step": 210
+   },
+   {
+     "epoch": 1.5800807537012114,
+     "grad_norm": 0.020714716985821724,
+     "learning_rate": 5.2533333333333336e-06,
+     "loss": 0.4609,
+     "step": 220
+   },
+   {
+     "epoch": 1.651861821444594,
+     "grad_norm": 0.020331306383013725,
+     "learning_rate": 4.986666666666667e-06,
+     "loss": 0.458,
+     "step": 230
+   },
+   {
+     "epoch": 1.7236428891879767,
+     "grad_norm": 0.019636554643511772,
+     "learning_rate": 4.7200000000000005e-06,
+     "loss": 0.4559,
+     "step": 240
+   },
+   {
+     "epoch": 1.7954239569313595,
+     "grad_norm": 0.020189929753541946,
+     "learning_rate": 4.453333333333334e-06,
+     "loss": 0.454,
+     "step": 250
+   },
+   {
+     "epoch": 1.867205024674742,
+     "grad_norm": 0.020626794546842575,
+     "learning_rate": 4.1866666666666675e-06,
+     "loss": 0.4507,
+     "step": 260
+   },
+   {
+     "epoch": 1.9389860924181246,
+     "grad_norm": 0.02100289985537529,
+     "learning_rate": 3.920000000000001e-06,
+     "loss": 0.4498,
+     "step": 270
+   },
+   {
+     "epoch": 2.0116644235082997,
+     "grad_norm": 0.020512910559773445,
+     "learning_rate": 3.6533333333333336e-06,
+     "loss": 0.4824,
+     "step": 280
+   },
+   {
+     "epoch": 2.083445491251682,
+     "grad_norm": 0.02125644125044346,
+     "learning_rate": 3.386666666666667e-06,
+     "loss": 0.4383,
+     "step": 290
+   },
+   {
+     "epoch": 2.155226558995065,
+     "grad_norm": 0.021306023001670837,
+     "learning_rate": 3.12e-06,
+     "loss": 0.4377,
+     "step": 300
    }
  ],
  "logging_steps": 10,
@@ -166,7 +236,7 @@
      "attributes": {}
    }
  },
- "total_flos": 5.3571925946925056e+17,
+ "total_flos": 8.035788892038758e+17,
  "train_batch_size": 14,
  "trial_name": null,
  "trial_params": null