KotshinZ commited on
Commit
d6ed45d
·
verified ·
1 Parent(s): 54d0b71

Model save

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +785 -25
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 1.279280082405294e+17,
3
- "train_loss": 0.5150357365608216,
4
- "train_runtime": 1208.0268,
5
  "train_samples": 15921,
6
- "train_samples_per_second": 0.795,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
+ "total_flos": 2.496956368699785e+18,
3
+ "train_loss": 0.33275703743100166,
4
+ "train_runtime": 21613.1094,
5
  "train_samples": 15921,
6
+ "train_samples_per_second": 0.863,
7
+ "train_steps_per_second": 0.005
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 1.279280082405294e+17,
3
- "train_loss": 0.5150357365608216,
4
- "train_runtime": 1208.0268,
5
  "train_samples": 15921,
6
- "train_samples_per_second": 0.795,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
+ "total_flos": 2.496956368699785e+18,
3
+ "train_loss": 0.33275703743100166,
4
+ "train_runtime": 21613.1094,
5
  "train_samples": 15921,
6
+ "train_samples_per_second": 0.863,
7
+ "train_steps_per_second": 0.005
8
  }
trainer_state.json CHANGED
@@ -2,16 +2,16 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2564102564102564,
6
  "eval_steps": 500,
7
- "global_step": 5,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.05128205128205128,
14
- "grad_norm": 2.7185363699514093,
15
  "learning_rate": 0.0,
16
  "loss": 0.5757,
17
  "num_tokens": 6240601.0,
@@ -19,51 +19,811 @@
19
  },
20
  {
21
  "epoch": 0.10256410256410256,
22
- "grad_norm": 2.7385019113690348,
23
- "learning_rate": 4e-05,
24
  "loss": 0.5766,
25
  "num_tokens": 12512622.0,
26
  "step": 2
27
  },
28
  {
29
  "epoch": 0.15384615384615385,
30
- "grad_norm": 2.3932904803502724,
31
- "learning_rate": 3.472792206135786e-05,
32
- "loss": 0.4872,
33
  "num_tokens": 18771271.0,
34
  "step": 3
35
  },
36
  {
37
  "epoch": 0.20512820512820512,
38
- "grad_norm": 1.998212103285214,
39
- "learning_rate": 2.2000000000000003e-05,
40
- "loss": 0.475,
41
  "num_tokens": 25037339.0,
42
  "step": 4
43
  },
44
  {
45
  "epoch": 0.2564102564102564,
46
- "grad_norm": 0.7904919909169668,
47
- "learning_rate": 9.272077938642147e-06,
48
- "loss": 0.4606,
49
  "num_tokens": 31263584.0,
50
  "step": 5
51
  },
52
  {
53
- "epoch": 0.2564102564102564,
54
- "step": 5,
55
- "total_flos": 1.279280082405294e+17,
56
- "train_loss": 0.5150357365608216,
57
- "train_runtime": 1208.0268,
58
- "train_samples_per_second": 0.795,
59
- "train_steps_per_second": 0.004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  }
61
  ],
62
  "logging_steps": 1,
63
- "max_steps": 5,
64
  "num_input_tokens_seen": 0,
65
- "num_train_epochs": 1,
66
- "save_steps": 500,
67
  "stateful_callbacks": {
68
  "TrainerControl": {
69
  "args": {
@@ -76,7 +836,7 @@
76
  "attributes": {}
77
  }
78
  },
79
- "total_flos": 1.279280082405294e+17,
80
  "train_batch_size": 1,
81
  "trial_name": null,
82
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
  "eval_steps": 500,
7
+ "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.05128205128205128,
14
+ "grad_norm": 2.718615561225469,
15
  "learning_rate": 0.0,
16
  "loss": 0.5757,
17
  "num_tokens": 6240601.0,
 
19
  },
20
  {
21
  "epoch": 0.10256410256410256,
22
+ "grad_norm": 2.7384427318262325,
23
+ "learning_rate": 1.3333333333333333e-05,
24
  "loss": 0.5766,
25
  "num_tokens": 12512622.0,
26
  "step": 2
27
  },
28
  {
29
  "epoch": 0.15384615384615385,
30
+ "grad_norm": 1.9843963750520588,
31
+ "learning_rate": 2.6666666666666667e-05,
32
+ "loss": 0.5242,
33
  "num_tokens": 18771271.0,
34
  "step": 3
35
  },
36
  {
37
  "epoch": 0.20512820512820512,
38
+ "grad_norm": 2.12777357229112,
39
+ "learning_rate": 4e-05,
40
+ "loss": 0.4917,
41
  "num_tokens": 25037339.0,
42
  "step": 4
43
  },
44
  {
45
  "epoch": 0.2564102564102564,
46
+ "grad_norm": 2.062032488995502,
47
+ "learning_rate": 3.9990560242819274e-05,
48
+ "loss": 0.4871,
49
  "num_tokens": 31263584.0,
50
  "step": 5
51
  },
52
  {
53
+ "epoch": 0.3076923076923077,
54
+ "grad_norm": 0.7945484310641507,
55
+ "learning_rate": 3.996225087227881e-05,
56
+ "loss": 0.4538,
57
+ "num_tokens": 37544028.0,
58
+ "step": 6
59
+ },
60
+ {
61
+ "epoch": 0.358974358974359,
62
+ "grad_norm": 0.860696592609233,
63
+ "learning_rate": 3.991510158099905e-05,
64
+ "loss": 0.4585,
65
+ "num_tokens": 43782393.0,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.41025641025641024,
70
+ "grad_norm": 0.505429642395346,
71
+ "learning_rate": 3.9849161822075655e-05,
72
+ "loss": 0.4423,
73
+ "num_tokens": 50063870.0,
74
+ "step": 8
75
+ },
76
+ {
77
+ "epoch": 0.46153846153846156,
78
+ "grad_norm": 0.7741144627099796,
79
+ "learning_rate": 3.976450075721003e-05,
80
+ "loss": 0.4439,
81
+ "num_tokens": 56308984.0,
82
+ "step": 9
83
+ },
84
+ {
85
+ "epoch": 0.5128205128205128,
86
+ "grad_norm": 0.40862829974318693,
87
+ "learning_rate": 3.9661207184168305e-05,
88
+ "loss": 0.4234,
89
+ "num_tokens": 62545986.0,
90
+ "step": 10
91
+ },
92
+ {
93
+ "epoch": 0.5641025641025641,
94
+ "grad_norm": 0.4739349747902679,
95
+ "learning_rate": 3.953938944364467e-05,
96
+ "loss": 0.4171,
97
+ "num_tokens": 68774917.0,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 0.6153846153846154,
102
+ "grad_norm": 0.36916258563551035,
103
+ "learning_rate": 3.939917530562701e-05,
104
+ "loss": 0.4182,
105
+ "num_tokens": 75056519.0,
106
+ "step": 12
107
+ },
108
+ {
109
+ "epoch": 0.6666666666666666,
110
+ "grad_norm": 0.33745730043472655,
111
+ "learning_rate": 3.9240711835383766e-05,
112
+ "loss": 0.3968,
113
+ "num_tokens": 81319039.0,
114
+ "step": 13
115
+ },
116
+ {
117
+ "epoch": 0.717948717948718,
118
+ "grad_norm": 0.3682996169134089,
119
+ "learning_rate": 3.9064165239212874e-05,
120
+ "loss": 0.4167,
121
+ "num_tokens": 87572805.0,
122
+ "step": 14
123
+ },
124
+ {
125
+ "epoch": 0.7692307692307693,
126
+ "grad_norm": 0.2610361232979272,
127
+ "learning_rate": 3.886972069011419e-05,
128
+ "loss": 0.4076,
129
+ "num_tokens": 93846455.0,
130
+ "step": 15
131
+ },
132
+ {
133
+ "epoch": 0.8205128205128205,
134
+ "grad_norm": 0.2574923493976134,
135
+ "learning_rate": 3.865758213356868e-05,
136
+ "loss": 0.3931,
137
+ "num_tokens": 100128858.0,
138
+ "step": 16
139
+ },
140
+ {
141
+ "epoch": 0.8717948717948718,
142
+ "grad_norm": 0.25582043047449654,
143
+ "learning_rate": 3.8427972073627724e-05,
144
+ "loss": 0.4034,
145
+ "num_tokens": 106382537.0,
146
+ "step": 17
147
+ },
148
+ {
149
+ "epoch": 0.9230769230769231,
150
+ "grad_norm": 0.22629074274698618,
151
+ "learning_rate": 3.818113133953712e-05,
152
+ "loss": 0.3938,
153
+ "num_tokens": 112641128.0,
154
+ "step": 18
155
+ },
156
+ {
157
+ "epoch": 0.9743589743589743,
158
+ "grad_norm": 0.1995507432503149,
159
+ "learning_rate": 3.791731883314043e-05,
160
+ "loss": 0.4007,
161
+ "num_tokens": 118920817.0,
162
+ "step": 19
163
+ },
164
+ {
165
+ "epoch": 1.0,
166
+ "grad_norm": 0.1995507432503149,
167
+ "learning_rate": 3.763681125732672e-05,
168
+ "loss": 0.3794,
169
+ "num_tokens": 120361037.0,
170
+ "step": 20
171
+ },
172
+ {
173
+ "epoch": 1.0512820512820513,
174
+ "grad_norm": 0.34048667122818044,
175
+ "learning_rate": 3.733990282580745e-05,
176
+ "loss": 0.3631,
177
+ "num_tokens": 126640793.0,
178
+ "step": 21
179
+ },
180
+ {
181
+ "epoch": 1.1025641025641026,
182
+ "grad_norm": 0.19615438837401894,
183
+ "learning_rate": 3.7026904954526884e-05,
184
+ "loss": 0.3736,
185
+ "num_tokens": 132891349.0,
186
+ "step": 22
187
+ },
188
+ {
189
+ "epoch": 1.1538461538461537,
190
+ "grad_norm": 0.16644896220477234,
191
+ "learning_rate": 3.6698145935029794e-05,
192
+ "loss": 0.3773,
193
+ "num_tokens": 139140959.0,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 1.205128205128205,
198
+ "grad_norm": 0.22809017244958504,
199
+ "learning_rate": 3.6353970590128975e-05,
200
+ "loss": 0.3763,
201
+ "num_tokens": 145391352.0,
202
+ "step": 24
203
+ },
204
+ {
205
+ "epoch": 1.2564102564102564,
206
+ "grad_norm": 0.16923970327686066,
207
+ "learning_rate": 3.599473991223369e-05,
208
+ "loss": 0.3644,
209
+ "num_tokens": 151670841.0,
210
+ "step": 25
211
+ },
212
+ {
213
+ "epoch": 1.3076923076923077,
214
+ "grad_norm": 0.17762206878485046,
215
+ "learning_rate": 3.5620830684718515e-05,
216
+ "loss": 0.3614,
217
+ "num_tokens": 157951707.0,
218
+ "step": 26
219
+ },
220
+ {
221
+ "epoch": 1.358974358974359,
222
+ "grad_norm": 0.17590149731091576,
223
+ "learning_rate": 3.523263508672961e-05,
224
+ "loss": 0.3678,
225
+ "num_tokens": 164210738.0,
226
+ "step": 27
227
+ },
228
+ {
229
+ "epoch": 1.4102564102564101,
230
+ "grad_norm": 0.1385680492384403,
231
+ "learning_rate": 3.483056028184293e-05,
232
+ "loss": 0.3636,
233
+ "num_tokens": 170465186.0,
234
+ "step": 28
235
+ },
236
+ {
237
+ "epoch": 1.4615384615384617,
238
+ "grad_norm": 0.15559805698049264,
239
+ "learning_rate": 3.441502799100588e-05,
240
+ "loss": 0.3728,
241
+ "num_tokens": 176747206.0,
242
+ "step": 29
243
+ },
244
+ {
245
+ "epoch": 1.5128205128205128,
246
+ "grad_norm": 0.14563346219653364,
247
+ "learning_rate": 3.398647405021026e-05,
248
+ "loss": 0.3653,
249
+ "num_tokens": 183011405.0,
250
+ "step": 30
251
+ },
252
+ {
253
+ "epoch": 1.564102564102564,
254
+ "grad_norm": 0.13958012967685854,
255
+ "learning_rate": 3.354534795336052e-05,
256
+ "loss": 0.3701,
257
+ "num_tokens": 189245121.0,
258
+ "step": 31
259
+ },
260
+ {
261
+ "epoch": 1.6153846153846154,
262
+ "grad_norm": 0.11987296091737516,
263
+ "learning_rate": 3.3092112380816696e-05,
264
+ "loss": 0.3543,
265
+ "num_tokens": 195525140.0,
266
+ "step": 32
267
+ },
268
+ {
269
+ "epoch": 1.6666666666666665,
270
+ "grad_norm": 0.14628950900383383,
271
+ "learning_rate": 3.262724271410661e-05,
272
+ "loss": 0.3564,
273
+ "num_tokens": 201755990.0,
274
+ "step": 33
275
+ },
276
+ {
277
+ "epoch": 1.717948717948718,
278
+ "grad_norm": 0.10624435308415683,
279
+ "learning_rate": 3.2151226537316315e-05,
280
+ "loss": 0.3555,
281
+ "num_tokens": 208001091.0,
282
+ "step": 34
283
+ },
284
+ {
285
+ "epoch": 1.7692307692307692,
286
+ "grad_norm": 0.13312768270680841,
287
+ "learning_rate": 3.166456312568171e-05,
288
+ "loss": 0.367,
289
+ "num_tokens": 214269555.0,
290
+ "step": 35
291
+ },
292
+ {
293
+ "epoch": 1.8205128205128205,
294
+ "grad_norm": 0.11882236522455852,
295
+ "learning_rate": 3.116776292191774e-05,
296
+ "loss": 0.3492,
297
+ "num_tokens": 220504752.0,
298
+ "step": 36
299
+ },
300
+ {
301
+ "epoch": 1.8717948717948718,
302
+ "grad_norm": 0.1108379812394889,
303
+ "learning_rate": 3.0661347000834496e-05,
304
+ "loss": 0.3603,
305
+ "num_tokens": 226746431.0,
306
+ "step": 37
307
+ },
308
+ {
309
+ "epoch": 1.9230769230769231,
310
+ "grad_norm": 0.1209856619633538,
311
+ "learning_rate": 3.0145846522801703e-05,
312
+ "loss": 0.364,
313
+ "num_tokens": 233000985.0,
314
+ "step": 38
315
+ },
316
+ {
317
+ "epoch": 1.9743589743589745,
318
+ "grad_norm": 0.11419043533635012,
319
+ "learning_rate": 2.962180217663483e-05,
320
+ "loss": 0.3555,
321
+ "num_tokens": 239282562.0,
322
+ "step": 39
323
+ },
324
+ {
325
+ "epoch": 2.0,
326
+ "grad_norm": 0.18167550021785242,
327
+ "learning_rate": 2.908976361248717e-05,
328
+ "loss": 0.3489,
329
+ "num_tokens": 240721535.0,
330
+ "step": 40
331
+ },
332
+ {
333
+ "epoch": 2.051282051282051,
334
+ "grad_norm": 0.15212860008317705,
335
+ "learning_rate": 2.855028886534278e-05,
336
+ "loss": 0.3218,
337
+ "num_tokens": 246968297.0,
338
+ "step": 41
339
+ },
340
+ {
341
+ "epoch": 2.1025641025641026,
342
+ "grad_norm": 0.1298804530561056,
343
+ "learning_rate": 2.8003943769714776e-05,
344
+ "loss": 0.3299,
345
+ "num_tokens": 253234954.0,
346
+ "step": 42
347
+ },
348
+ {
349
+ "epoch": 2.1538461538461537,
350
+ "grad_norm": 0.1405980356910723,
351
+ "learning_rate": 2.7451301366163116e-05,
352
+ "loss": 0.3196,
353
+ "num_tokens": 259516218.0,
354
+ "step": 43
355
+ },
356
+ {
357
+ "epoch": 2.2051282051282053,
358
+ "grad_norm": 0.10364248867045861,
359
+ "learning_rate": 2.6892941300254176e-05,
360
+ "loss": 0.3287,
361
+ "num_tokens": 265777892.0,
362
+ "step": 44
363
+ },
364
+ {
365
+ "epoch": 2.2564102564102564,
366
+ "grad_norm": 0.1283238256584598,
367
+ "learning_rate": 2.6329449214592568e-05,
368
+ "loss": 0.3268,
369
+ "num_tokens": 272038252.0,
370
+ "step": 45
371
+ },
372
+ {
373
+ "epoch": 2.3076923076923075,
374
+ "grad_norm": 0.09763592234171578,
375
+ "learning_rate": 2.5761416134562955e-05,
376
+ "loss": 0.3126,
377
+ "num_tokens": 278296569.0,
378
+ "step": 46
379
+ },
380
+ {
381
+ "epoch": 2.358974358974359,
382
+ "grad_norm": 0.11539957768743261,
383
+ "learning_rate": 2.5189437848426016e-05,
384
+ "loss": 0.3241,
385
+ "num_tokens": 284513318.0,
386
+ "step": 47
387
+ },
388
+ {
389
+ "epoch": 2.41025641025641,
390
+ "grad_norm": 0.10255646114610974,
391
+ "learning_rate": 2.461411428241883e-05,
392
+ "loss": 0.313,
393
+ "num_tokens": 290793851.0,
394
+ "step": 48
395
+ },
396
+ {
397
+ "epoch": 2.4615384615384617,
398
+ "grad_norm": 0.11056138494167911,
399
+ "learning_rate": 2.403604887151512e-05,
400
+ "loss": 0.3142,
401
+ "num_tokens": 297038281.0,
402
+ "step": 49
403
+ },
404
+ {
405
+ "epoch": 2.5128205128205128,
406
+ "grad_norm": 0.09087426235660283,
407
+ "learning_rate": 2.3455847926505283e-05,
408
+ "loss": 0.3237,
409
+ "num_tokens": 303309073.0,
410
+ "step": 50
411
+ },
412
+ {
413
+ "epoch": 2.564102564102564,
414
+ "grad_norm": 0.09942623723641979,
415
+ "learning_rate": 2.287411999806007e-05,
416
+ "loss": 0.31,
417
+ "num_tokens": 309590829.0,
418
+ "step": 51
419
+ },
420
+ {
421
+ "epoch": 2.6153846153846154,
422
+ "grad_norm": 0.0814248146482854,
423
+ "learning_rate": 2.2291475238445033e-05,
424
+ "loss": 0.3211,
425
+ "num_tokens": 315858116.0,
426
+ "step": 52
427
+ },
428
+ {
429
+ "epoch": 2.6666666666666665,
430
+ "grad_norm": 0.09568614448485548,
431
+ "learning_rate": 2.1708524761554973e-05,
432
+ "loss": 0.3177,
433
+ "num_tokens": 322118809.0,
434
+ "step": 53
435
+ },
436
+ {
437
+ "epoch": 2.717948717948718,
438
+ "grad_norm": 0.07964923658403533,
439
+ "learning_rate": 2.112588000193994e-05,
440
+ "loss": 0.319,
441
+ "num_tokens": 328400995.0,
442
+ "step": 54
443
+ },
444
+ {
445
+ "epoch": 2.769230769230769,
446
+ "grad_norm": 0.08309011179734216,
447
+ "learning_rate": 2.054415207349473e-05,
448
+ "loss": 0.3203,
449
+ "num_tokens": 334639597.0,
450
+ "step": 55
451
+ },
452
+ {
453
+ "epoch": 2.8205128205128203,
454
+ "grad_norm": 0.08196424816532498,
455
+ "learning_rate": 1.9963951128484886e-05,
456
+ "loss": 0.3221,
457
+ "num_tokens": 340841925.0,
458
+ "step": 56
459
+ },
460
+ {
461
+ "epoch": 2.871794871794872,
462
+ "grad_norm": 0.07847272582125499,
463
+ "learning_rate": 1.9385885717581182e-05,
464
+ "loss": 0.3133,
465
+ "num_tokens": 347123519.0,
466
+ "step": 57
467
+ },
468
+ {
469
+ "epoch": 2.9230769230769234,
470
+ "grad_norm": 0.07759377817882507,
471
+ "learning_rate": 1.8810562151573993e-05,
472
+ "loss": 0.3039,
473
+ "num_tokens": 353396517.0,
474
+ "step": 58
475
+ },
476
+ {
477
+ "epoch": 2.9743589743589745,
478
+ "grad_norm": 0.08098128127962194,
479
+ "learning_rate": 1.823858386543705e-05,
480
+ "loss": 0.3097,
481
+ "num_tokens": 359678782.0,
482
+ "step": 59
483
+ },
484
+ {
485
+ "epoch": 3.0,
486
+ "grad_norm": 0.08098128127962194,
487
+ "learning_rate": 1.7670550785407444e-05,
488
+ "loss": 0.311,
489
+ "num_tokens": 361081292.0,
490
+ "step": 60
491
+ },
492
+ {
493
+ "epoch": 3.051282051282051,
494
+ "grad_norm": 0.1219526450192946,
495
+ "learning_rate": 1.710705869974583e-05,
496
+ "loss": 0.2909,
497
+ "num_tokens": 367363606.0,
498
+ "step": 61
499
+ },
500
+ {
501
+ "epoch": 3.1025641025641026,
502
+ "grad_norm": 0.09709390348755251,
503
+ "learning_rate": 1.6548698633836893e-05,
504
+ "loss": 0.2929,
505
+ "num_tokens": 373623626.0,
506
+ "step": 62
507
+ },
508
+ {
509
+ "epoch": 3.1538461538461537,
510
+ "grad_norm": 0.09032197466642125,
511
+ "learning_rate": 1.5996056230285237e-05,
512
+ "loss": 0.2881,
513
+ "num_tokens": 379905390.0,
514
+ "step": 63
515
+ },
516
+ {
517
+ "epoch": 3.2051282051282053,
518
+ "grad_norm": 0.09593201487472482,
519
+ "learning_rate": 1.5449711134657224e-05,
520
+ "loss": 0.2849,
521
+ "num_tokens": 386151857.0,
522
+ "step": 64
523
+ },
524
+ {
525
+ "epoch": 3.2564102564102564,
526
+ "grad_norm": 0.08482876184821657,
527
+ "learning_rate": 1.4910236387512837e-05,
528
+ "loss": 0.2849,
529
+ "num_tokens": 392391728.0,
530
+ "step": 65
531
+ },
532
+ {
533
+ "epoch": 3.3076923076923075,
534
+ "grad_norm": 0.08786917404760929,
535
+ "learning_rate": 1.4378197823365186e-05,
536
+ "loss": 0.2919,
537
+ "num_tokens": 398635444.0,
538
+ "step": 66
539
+ },
540
+ {
541
+ "epoch": 3.358974358974359,
542
+ "grad_norm": 0.09124789781748067,
543
+ "learning_rate": 1.3854153477198305e-05,
544
+ "loss": 0.2877,
545
+ "num_tokens": 404884484.0,
546
+ "step": 67
547
+ },
548
+ {
549
+ "epoch": 3.41025641025641,
550
+ "grad_norm": 0.08376420028878678,
551
+ "learning_rate": 1.3338652999165511e-05,
552
+ "loss": 0.2787,
553
+ "num_tokens": 411165857.0,
554
+ "step": 68
555
+ },
556
+ {
557
+ "epoch": 3.4615384615384617,
558
+ "grad_norm": 0.08461395911230184,
559
+ "learning_rate": 1.2832237078082272e-05,
560
+ "loss": 0.2874,
561
+ "num_tokens": 417447084.0,
562
+ "step": 69
563
+ },
564
+ {
565
+ "epoch": 3.5128205128205128,
566
+ "grad_norm": 0.0805797686999939,
567
+ "learning_rate": 1.2335436874318293e-05,
568
+ "loss": 0.2868,
569
+ "num_tokens": 423708369.0,
570
+ "step": 70
571
+ },
572
+ {
573
+ "epoch": 3.564102564102564,
574
+ "grad_norm": 0.0812369216368886,
575
+ "learning_rate": 1.1848773462683684e-05,
576
+ "loss": 0.2731,
577
+ "num_tokens": 429917854.0,
578
+ "step": 71
579
+ },
580
+ {
581
+ "epoch": 3.6153846153846154,
582
+ "grad_norm": 0.08068456709857151,
583
+ "learning_rate": 1.13727572858934e-05,
584
+ "loss": 0.2766,
585
+ "num_tokens": 436166148.0,
586
+ "step": 72
587
+ },
588
+ {
589
+ "epoch": 3.6666666666666665,
590
+ "grad_norm": 0.07330776888938495,
591
+ "learning_rate": 1.0907887619183308e-05,
592
+ "loss": 0.2815,
593
+ "num_tokens": 442404416.0,
594
+ "step": 73
595
+ },
596
+ {
597
+ "epoch": 3.717948717948718,
598
+ "grad_norm": 0.07750844403333255,
599
+ "learning_rate": 1.0454652046639486e-05,
600
+ "loss": 0.2765,
601
+ "num_tokens": 448671262.0,
602
+ "step": 74
603
+ },
604
+ {
605
+ "epoch": 3.769230769230769,
606
+ "grad_norm": 0.06910018245510882,
607
+ "learning_rate": 1.0013525949789745e-05,
608
+ "loss": 0.2765,
609
+ "num_tokens": 454908778.0,
610
+ "step": 75
611
+ },
612
+ {
613
+ "epoch": 3.8205128205128203,
614
+ "grad_norm": 0.06772004012970233,
615
+ "learning_rate": 9.584972008994123e-06,
616
+ "loss": 0.2761,
617
+ "num_tokens": 461176026.0,
618
+ "step": 76
619
+ },
620
+ {
621
+ "epoch": 3.871794871794872,
622
+ "grad_norm": 0.06449052728723467,
623
+ "learning_rate": 9.16943971815708e-06,
624
+ "loss": 0.2718,
625
+ "num_tokens": 467459020.0,
626
+ "step": 77
627
+ },
628
+ {
629
+ "epoch": 3.9230769230769234,
630
+ "grad_norm": 0.06591642004614524,
631
+ "learning_rate": 8.767364913270399e-06,
632
+ "loss": 0.2686,
633
+ "num_tokens": 473721944.0,
634
+ "step": 78
635
+ },
636
+ {
637
+ "epoch": 3.9743589743589745,
638
+ "grad_norm": 0.05776576407058006,
639
+ "learning_rate": 8.379169315281485e-06,
640
+ "loss": 0.2763,
641
+ "num_tokens": 480002563.0,
642
+ "step": 79
643
+ },
644
+ {
645
+ "epoch": 4.0,
646
+ "grad_norm": 0.13745363566927554,
647
+ "learning_rate": 8.005260087766318e-06,
648
+ "loss": 0.268,
649
+ "num_tokens": 481441850.0,
650
+ "step": 80
651
+ },
652
+ {
653
+ "epoch": 4.051282051282051,
654
+ "grad_norm": 0.09534028423404649,
655
+ "learning_rate": 7.646029409871029e-06,
656
+ "loss": 0.2577,
657
+ "num_tokens": 487660172.0,
658
+ "step": 81
659
+ },
660
+ {
661
+ "epoch": 4.102564102564102,
662
+ "grad_norm": 0.06997274489921491,
663
+ "learning_rate": 7.301854064970202e-06,
664
+ "loss": 0.2557,
665
+ "num_tokens": 493912073.0,
666
+ "step": 82
667
+ },
668
+ {
669
+ "epoch": 4.153846153846154,
670
+ "grad_norm": 0.08755702656310224,
671
+ "learning_rate": 6.973095045473124e-06,
672
+ "loss": 0.2604,
673
+ "num_tokens": 500168012.0,
674
+ "step": 83
675
+ },
676
+ {
677
+ "epoch": 4.205128205128205,
678
+ "grad_norm": 0.08888143272022463,
679
+ "learning_rate": 6.660097174192556e-06,
680
+ "loss": 0.2629,
681
+ "num_tokens": 506448923.0,
682
+ "step": 84
683
+ },
684
+ {
685
+ "epoch": 4.256410256410256,
686
+ "grad_norm": 0.06765618005674934,
687
+ "learning_rate": 6.363188742673281e-06,
688
+ "loss": 0.2597,
689
+ "num_tokens": 512697543.0,
690
+ "step": 85
691
+ },
692
+ {
693
+ "epoch": 4.3076923076923075,
694
+ "grad_norm": 0.08077914878872741,
695
+ "learning_rate": 6.082681166859579e-06,
696
+ "loss": 0.2565,
697
+ "num_tokens": 518952837.0,
698
+ "step": 86
699
+ },
700
+ {
701
+ "epoch": 4.358974358974359,
702
+ "grad_norm": 0.07580797752338386,
703
+ "learning_rate": 5.818868660462886e-06,
704
+ "loss": 0.2622,
705
+ "num_tokens": 525234697.0,
706
+ "step": 87
707
+ },
708
+ {
709
+ "epoch": 4.410256410256411,
710
+ "grad_norm": 0.06618677478470034,
711
+ "learning_rate": 5.5720279263722795e-06,
712
+ "loss": 0.2571,
713
+ "num_tokens": 531495708.0,
714
+ "step": 88
715
+ },
716
+ {
717
+ "epoch": 4.461538461538462,
718
+ "grad_norm": 0.0680578770532582,
719
+ "learning_rate": 5.342417866431326e-06,
720
+ "loss": 0.2583,
721
+ "num_tokens": 537744492.0,
722
+ "step": 89
723
+ },
724
+ {
725
+ "epoch": 4.512820512820513,
726
+ "grad_norm": 0.0730363106871515,
727
+ "learning_rate": 5.130279309885817e-06,
728
+ "loss": 0.2523,
729
+ "num_tokens": 544012162.0,
730
+ "step": 90
731
+ },
732
+ {
733
+ "epoch": 4.564102564102564,
734
+ "grad_norm": 0.06548896520491608,
735
+ "learning_rate": 4.935834760787133e-06,
736
+ "loss": 0.2556,
737
+ "num_tokens": 550280847.0,
738
+ "step": 91
739
+ },
740
+ {
741
+ "epoch": 4.615384615384615,
742
+ "grad_norm": 0.06057215331898529,
743
+ "learning_rate": 4.7592881646162336e-06,
744
+ "loss": 0.2534,
745
+ "num_tokens": 556540016.0,
746
+ "step": 92
747
+ },
748
+ {
749
+ "epoch": 4.666666666666667,
750
+ "grad_norm": 0.0660973563878409,
751
+ "learning_rate": 4.600824694373e-06,
752
+ "loss": 0.259,
753
+ "num_tokens": 562819807.0,
754
+ "step": 93
755
+ },
756
+ {
757
+ "epoch": 4.717948717948718,
758
+ "grad_norm": 0.06127804475677186,
759
+ "learning_rate": 4.460610556355333e-06,
760
+ "loss": 0.2516,
761
+ "num_tokens": 569101952.0,
762
+ "step": 94
763
+ },
764
+ {
765
+ "epoch": 4.769230769230769,
766
+ "grad_norm": 0.06055474487432341,
767
+ "learning_rate": 4.338792815831698e-06,
768
+ "loss": 0.2496,
769
+ "num_tokens": 575336966.0,
770
+ "step": 95
771
+ },
772
+ {
773
+ "epoch": 4.82051282051282,
774
+ "grad_norm": 0.05900101598712522,
775
+ "learning_rate": 4.2354992427899674e-06,
776
+ "loss": 0.2525,
777
+ "num_tokens": 581620237.0,
778
+ "step": 96
779
+ },
780
+ {
781
+ "epoch": 4.871794871794872,
782
+ "grad_norm": 0.06942816751525951,
783
+ "learning_rate": 4.150838177924349e-06,
784
+ "loss": 0.2589,
785
+ "num_tokens": 587875142.0,
786
+ "step": 97
787
+ },
788
+ {
789
+ "epoch": 4.923076923076923,
790
+ "grad_norm": 0.06137215301815073,
791
+ "learning_rate": 4.0848984190009495e-06,
792
+ "loss": 0.2616,
793
+ "num_tokens": 594142883.0,
794
+ "step": 98
795
+ },
796
+ {
797
+ "epoch": 4.9743589743589745,
798
+ "grad_norm": 0.06146449383500362,
799
+ "learning_rate": 4.037749127721191e-06,
800
+ "loss": 0.2508,
801
+ "num_tokens": 600408547.0,
802
+ "step": 99
803
+ },
804
+ {
805
+ "epoch": 5.0,
806
+ "grad_norm": 0.06146449383500362,
807
+ "learning_rate": 4.009439757180732e-06,
808
+ "loss": 0.247,
809
+ "num_tokens": 601802442.0,
810
+ "step": 100
811
+ },
812
+ {
813
+ "epoch": 5.0,
814
+ "step": 100,
815
+ "total_flos": 2.496956368699785e+18,
816
+ "train_loss": 0.33275703743100166,
817
+ "train_runtime": 21613.1094,
818
+ "train_samples_per_second": 0.863,
819
+ "train_steps_per_second": 0.005
820
  }
821
  ],
822
  "logging_steps": 1,
823
+ "max_steps": 100,
824
  "num_input_tokens_seen": 0,
825
+ "num_train_epochs": 5,
826
+ "save_steps": 5,
827
  "stateful_callbacks": {
828
  "TrainerControl": {
829
  "args": {
 
836
  "attributes": {}
837
  }
838
  },
839
+ "total_flos": 2.496956368699785e+18,
840
  "train_batch_size": 1,
841
  "trial_name": null,
842
  "trial_params": null