crossroderick commited on
Commit
9fea118
·
1 Parent(s): 49af62d

Fourth iteration with 1.9 million training records

Browse files
Files changed (45) hide show
  1. README.md +1 -1
  2. checkpoints/checkpoint-50000/trainer_state.json +0 -734
  3. checkpoints/checkpoint-50500/trainer_state.json +0 -741
  4. checkpoints/checkpoint-50606/trainer_state.json +0 -741
  5. checkpoints/{checkpoint-50000 → checkpoint-61500}/config.json +0 -0
  6. checkpoints/{checkpoint-50000 → checkpoint-61500}/generation_config.json +0 -0
  7. checkpoints/{checkpoint-50000 → checkpoint-61500}/model.safetensors +1 -1
  8. checkpoints/{checkpoint-50500 → checkpoint-61500}/optimizer.pt +1 -1
  9. checkpoints/{checkpoint-50606 → checkpoint-61500}/rng_state.pth +1 -1
  10. checkpoints/{checkpoint-50606 → checkpoint-61500}/scaler.pt +1 -1
  11. checkpoints/{checkpoint-50000 → checkpoint-61500}/scheduler.pt +1 -1
  12. checkpoints/{checkpoint-50000 → checkpoint-61500}/special_tokens_map.json +0 -0
  13. checkpoints/{checkpoint-50000 → checkpoint-61500}/spiece.model +0 -0
  14. checkpoints/{checkpoint-50000 → checkpoint-61500}/tokenizer.json +0 -0
  15. checkpoints/{checkpoint-50000 → checkpoint-61500}/tokenizer_config.json +0 -0
  16. checkpoints/checkpoint-61500/trainer_state.json +895 -0
  17. checkpoints/{checkpoint-50000 → checkpoint-61500}/training_args.bin +0 -0
  18. checkpoints/{checkpoint-50500 → checkpoint-62000}/config.json +0 -0
  19. checkpoints/{checkpoint-50500 → checkpoint-62000}/generation_config.json +0 -0
  20. checkpoints/{checkpoint-50500 → checkpoint-62000}/model.safetensors +1 -1
  21. checkpoints/{checkpoint-50000 → checkpoint-62000}/optimizer.pt +1 -1
  22. checkpoints/{checkpoint-50500 → checkpoint-62000}/rng_state.pth +1 -1
  23. checkpoints/{checkpoint-50500 → checkpoint-62000}/scaler.pt +1 -1
  24. checkpoints/{checkpoint-50606 → checkpoint-62000}/scheduler.pt +1 -1
  25. checkpoints/{checkpoint-50500 → checkpoint-62000}/special_tokens_map.json +0 -0
  26. checkpoints/{checkpoint-50500 → checkpoint-62000}/spiece.model +0 -0
  27. checkpoints/{checkpoint-50500 → checkpoint-62000}/tokenizer.json +0 -0
  28. checkpoints/{checkpoint-50500 → checkpoint-62000}/tokenizer_config.json +0 -0
  29. checkpoints/checkpoint-62000/trainer_state.json +902 -0
  30. checkpoints/{checkpoint-50500 → checkpoint-62000}/training_args.bin +0 -0
  31. checkpoints/{checkpoint-50606 → checkpoint-62228}/config.json +0 -0
  32. checkpoints/{checkpoint-50606 → checkpoint-62228}/generation_config.json +0 -0
  33. checkpoints/{checkpoint-50606 → checkpoint-62228}/model.safetensors +1 -1
  34. checkpoints/{checkpoint-50606 → checkpoint-62228}/optimizer.pt +1 -1
  35. checkpoints/{checkpoint-50000 → checkpoint-62228}/rng_state.pth +1 -1
  36. checkpoints/{checkpoint-50000 → checkpoint-62228}/scaler.pt +1 -1
  37. checkpoints/{checkpoint-50500 → checkpoint-62228}/scheduler.pt +1 -1
  38. checkpoints/{checkpoint-50606 → checkpoint-62228}/special_tokens_map.json +0 -0
  39. checkpoints/{checkpoint-50606 → checkpoint-62228}/spiece.model +0 -0
  40. checkpoints/{checkpoint-50606 → checkpoint-62228}/tokenizer.json +0 -0
  41. checkpoints/{checkpoint-50606 → checkpoint-62228}/tokenizer_config.json +0 -0
  42. checkpoints/checkpoint-62228/trainer_state.json +902 -0
  43. checkpoints/{checkpoint-50606 → checkpoint-62228}/training_args.bin +0 -0
  44. model.safetensors +1 -1
  45. src/data/__pycache__/generate_cyr_lat_pairs.cpython-312.pyc +0 -0
README.md CHANGED
@@ -132,7 +132,7 @@ KazParC деректер жинағын жүктеп алу үшін сізге
132
 
133
  - **DalaT5 v3**: 20 сәуірде дәл реттелген, 20 сәуірде қолжетімді болды. Жаттығу үшін ~1,6 миллион деректер жазбасы пайдаланылды. Үшінші итерация одан әрі жақсартуларды, сондай-ақ белгілі бір дәрежеде семантикалық түсінуді көрсетті / Fine-tuned on April 20, made available on April 20. Used ~1.6 million data records for training. Third iteration that showed further improvements, as well as some degree of semantic understanding
134
 
135
- - **DalaT5 v4**: Нақты баптау 23 сәуірде болады және сол күні қолжетімді болады. ~1,9 миллион жазбаны пайдалануға орнату (Wikipedia dump + CC100 + KazParC) / Fine-tuning to take place on April 23 and will be made available on the same day. Set to use ~1.9 million records (Wikipedia dump + CC100 + KazParC)
136
 
137
  ---
138
 
 
132
 
133
  - **DalaT5 v3**: 20 сәуірде дәл реттелген, 20 сәуірде қолжетімді болды. Жаттығу үшін ~1,6 миллион деректер жазбасы пайдаланылды. Үшінші итерация одан әрі жақсартуларды, сондай-ақ белгілі бір дәрежеде семантикалық түсінуді көрсетті / Fine-tuned on April 20, made available on April 20. Used ~1.6 million data records for training. Third iteration that showed further improvements, as well as some degree of semantic understanding
134
 
135
+ - **DalaT5 v4**: 23 сәуірде нақтыланған, 23 сәуірде қолжетімді болды. Жаттығу үшін ~1,9 миллион жазба (Wikipedia dump + CC100 + KazParC) пайдаланылды. Семантикалық түсініктің жоғарылауын көрсететін төртінші итерация / Fine-tuned on April 23, made available on April 23. Used ~1.9 million records (Wikipedia dump + CC100 + KazParC) for training. Fourth iteration that showed increased semantic understanding.
136
 
137
  ---
138
 
checkpoints/checkpoint-50000/trainer_state.json DELETED
@@ -1,734 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.9760502707188872,
6
- "eval_steps": 500,
7
- "global_step": 50000,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.01976050270718887,
14
- "grad_norm": 0.40364792943000793,
15
- "learning_rate": 4.951092755799708e-05,
16
- "loss": 2.1373,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.03952100541437774,
21
- "grad_norm": 0.5190012454986572,
22
- "learning_rate": 4.901691499031736e-05,
23
- "loss": 1.1731,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.059281508121566615,
28
- "grad_norm": 0.44221577048301697,
29
- "learning_rate": 4.8522902422637634e-05,
30
- "loss": 0.9673,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.07904201082875548,
35
- "grad_norm": 0.3572881817817688,
36
- "learning_rate": 4.8028889854957916e-05,
37
- "loss": 0.8537,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.09880251353594435,
42
- "grad_norm": 0.36681538820266724,
43
- "learning_rate": 4.753487728727819e-05,
44
- "loss": 0.77,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.11856301624313323,
49
- "grad_norm": 0.3278830647468567,
50
- "learning_rate": 4.7040864719598474e-05,
51
- "loss": 0.7379,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.1383235189503221,
56
- "grad_norm": 0.32520031929016113,
57
- "learning_rate": 4.654685215191875e-05,
58
- "loss": 0.6918,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 0.15808402165751095,
63
- "grad_norm": 0.3115929961204529,
64
- "learning_rate": 4.6052839584239024e-05,
65
- "loss": 0.6635,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 0.17784452436469983,
70
- "grad_norm": 0.309794157743454,
71
- "learning_rate": 4.55588270165593e-05,
72
- "loss": 0.6363,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 0.1976050270718887,
77
- "grad_norm": 0.3163657784461975,
78
- "learning_rate": 4.506481444887958e-05,
79
- "loss": 0.6113,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 0.21736552977907758,
84
- "grad_norm": 0.27902352809906006,
85
- "learning_rate": 4.457080188119986e-05,
86
- "loss": 0.5899,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 0.23712603248626646,
91
- "grad_norm": 0.26167234778404236,
92
- "learning_rate": 4.407678931352014e-05,
93
- "loss": 0.58,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 0.25688653519345533,
98
- "grad_norm": 0.30986905097961426,
99
- "learning_rate": 4.3582776745840415e-05,
100
- "loss": 0.5632,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 0.2766470379006442,
105
- "grad_norm": 0.28008660674095154,
106
- "learning_rate": 4.30887641781607e-05,
107
- "loss": 0.5478,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 0.2964075406078331,
112
- "grad_norm": 0.2860545516014099,
113
- "learning_rate": 4.259475161048097e-05,
114
- "loss": 0.541,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 0.3161680433150219,
119
- "grad_norm": 0.23841875791549683,
120
- "learning_rate": 4.2100739042801254e-05,
121
- "loss": 0.5261,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 0.3359285460222108,
126
- "grad_norm": 0.24955500662326813,
127
- "learning_rate": 4.160672647512153e-05,
128
- "loss": 0.5184,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 0.35568904872939966,
133
- "grad_norm": 0.28650936484336853,
134
- "learning_rate": 4.1112713907441805e-05,
135
- "loss": 0.5092,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 0.37544955143658854,
140
- "grad_norm": 0.26088303327560425,
141
- "learning_rate": 4.061870133976208e-05,
142
- "loss": 0.4939,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 0.3952100541437774,
147
- "grad_norm": 0.27843499183654785,
148
- "learning_rate": 4.012468877208236e-05,
149
- "loss": 0.493,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 0.4149705568509663,
154
- "grad_norm": 0.45379650592803955,
155
- "learning_rate": 3.963067620440264e-05,
156
- "loss": 0.4824,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 0.43473105955815516,
161
- "grad_norm": 0.29186248779296875,
162
- "learning_rate": 3.913666363672292e-05,
163
- "loss": 0.4743,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 0.45449156226534404,
168
- "grad_norm": 0.2711990475654602,
169
- "learning_rate": 3.86426510690432e-05,
170
- "loss": 0.4684,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 0.4742520649725329,
175
- "grad_norm": 0.2682012915611267,
176
- "learning_rate": 3.814863850136348e-05,
177
- "loss": 0.4674,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 0.4940125676797218,
182
- "grad_norm": 0.3144526183605194,
183
- "learning_rate": 3.765462593368376e-05,
184
- "loss": 0.4565,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 0.5137730703869107,
189
- "grad_norm": 0.24828797578811646,
190
- "learning_rate": 3.7160613366004034e-05,
191
- "loss": 0.4601,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 0.5335335730940995,
196
- "grad_norm": 0.23556892573833466,
197
- "learning_rate": 3.6666600798324317e-05,
198
- "loss": 0.4456,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 0.5532940758012884,
203
- "grad_norm": 0.20168079435825348,
204
- "learning_rate": 3.617258823064459e-05,
205
- "loss": 0.4416,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 0.5730545785084773,
210
- "grad_norm": 0.26364487409591675,
211
- "learning_rate": 3.567857566296487e-05,
212
- "loss": 0.4394,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 0.5928150812156662,
217
- "grad_norm": 0.22534222900867462,
218
- "learning_rate": 3.518456309528514e-05,
219
- "loss": 0.4371,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 0.612575583922855,
224
- "grad_norm": 0.2668949067592621,
225
- "learning_rate": 3.4690550527605425e-05,
226
- "loss": 0.4292,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 0.6323360866300438,
231
- "grad_norm": 0.26472756266593933,
232
- "learning_rate": 3.41965379599257e-05,
233
- "loss": 0.4271,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 0.6520965893372327,
238
- "grad_norm": 0.238133504986763,
239
- "learning_rate": 3.3703513417381335e-05,
240
- "loss": 0.4214,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 0.6718570920444216,
245
- "grad_norm": 0.19885075092315674,
246
- "learning_rate": 3.320950084970162e-05,
247
- "loss": 0.4178,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 0.6916175947516104,
252
- "grad_norm": 0.2647489309310913,
253
- "learning_rate": 3.271548828202189e-05,
254
- "loss": 0.4171,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 0.7113780974587993,
259
- "grad_norm": 0.2713819444179535,
260
- "learning_rate": 3.2221475714342175e-05,
261
- "loss": 0.4152,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 0.7311386001659882,
266
- "grad_norm": 0.21879780292510986,
267
- "learning_rate": 3.172845117179781e-05,
268
- "loss": 0.4124,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 0.7508991028731771,
273
- "grad_norm": 0.21214431524276733,
274
- "learning_rate": 3.1234438604118085e-05,
275
- "loss": 0.4046,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 0.770659605580366,
280
- "grad_norm": 0.23132909834384918,
281
- "learning_rate": 3.074042603643837e-05,
282
- "loss": 0.4074,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 0.7904201082875548,
287
- "grad_norm": 0.25402024388313293,
288
- "learning_rate": 3.0246413468758646e-05,
289
- "loss": 0.4025,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 0.8101806109947437,
294
- "grad_norm": 0.24344465136528015,
295
- "learning_rate": 2.9754376951349644e-05,
296
- "loss": 0.3954,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 0.8299411137019326,
301
- "grad_norm": 0.35887858271598816,
302
- "learning_rate": 2.926036438366992e-05,
303
- "loss": 0.3999,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 0.8497016164091215,
308
- "grad_norm": 0.2312784641981125,
309
- "learning_rate": 2.87663518159902e-05,
310
- "loss": 0.3953,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 0.8694621191163103,
315
- "grad_norm": 0.25101277232170105,
316
- "learning_rate": 2.8273327273445837e-05,
317
- "loss": 0.394,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 0.8892226218234992,
322
- "grad_norm": 0.2582937479019165,
323
- "learning_rate": 2.777931470576612e-05,
324
- "loss": 0.3936,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 0.9089831245306881,
329
- "grad_norm": 0.20885373651981354,
330
- "learning_rate": 2.7285302138086394e-05,
331
- "loss": 0.384,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 0.928743627237877,
336
- "grad_norm": 0.23662053048610687,
337
- "learning_rate": 2.6791289570406676e-05,
338
- "loss": 0.3818,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 0.9485041299450658,
343
- "grad_norm": 0.23784968256950378,
344
- "learning_rate": 2.629727700272695e-05,
345
- "loss": 0.3884,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 0.9682646326522547,
350
- "grad_norm": 0.24405881762504578,
351
- "learning_rate": 2.5804252460182587e-05,
352
- "loss": 0.384,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 0.9880251353594436,
357
- "grad_norm": 0.18989978730678558,
358
- "learning_rate": 2.531023989250287e-05,
359
- "loss": 0.3821,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 1.0077856380666323,
364
- "grad_norm": 0.22834104299545288,
365
- "learning_rate": 2.4816227324823144e-05,
366
- "loss": 0.3801,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 1.0275461407738213,
371
- "grad_norm": 0.30803126096725464,
372
- "learning_rate": 2.4322214757143423e-05,
373
- "loss": 0.3739,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 1.04730664348101,
378
- "grad_norm": 0.21231615543365479,
379
- "learning_rate": 2.3828202189463698e-05,
380
- "loss": 0.3791,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 1.067067146188199,
385
- "grad_norm": 0.26202794909477234,
386
- "learning_rate": 2.333418962178398e-05,
387
- "loss": 0.3739,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 1.0868276488953879,
392
- "grad_norm": 0.21842999756336212,
393
- "learning_rate": 2.284017705410426e-05,
394
- "loss": 0.377,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 1.1065881516025768,
399
- "grad_norm": 0.22890307009220123,
400
- "learning_rate": 2.2346164486424538e-05,
401
- "loss": 0.3733,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 1.1263486543097656,
406
- "grad_norm": 0.20813824236392975,
407
- "learning_rate": 2.1852151918744816e-05,
408
- "loss": 0.37,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 1.1461091570169546,
413
- "grad_norm": 0.2490423172712326,
414
- "learning_rate": 2.1358139351065092e-05,
415
- "loss": 0.3704,
416
- "step": 29000
417
- },
418
- {
419
- "epoch": 1.1658696597241434,
420
- "grad_norm": 0.20076008141040802,
421
- "learning_rate": 2.086511480852073e-05,
422
- "loss": 0.3733,
423
- "step": 29500
424
- },
425
- {
426
- "epoch": 1.1856301624313323,
427
- "grad_norm": 0.20791497826576233,
428
- "learning_rate": 2.037110224084101e-05,
429
- "loss": 0.3682,
430
- "step": 30000
431
- },
432
- {
433
- "epoch": 1.2053906651385211,
434
- "grad_norm": 0.3393056094646454,
435
- "learning_rate": 1.9878077698296648e-05,
436
- "loss": 0.3687,
437
- "step": 30500
438
- },
439
- {
440
- "epoch": 1.22515116784571,
441
- "grad_norm": 0.21395719051361084,
442
- "learning_rate": 1.9384065130616923e-05,
443
- "loss": 0.3649,
444
- "step": 31000
445
- },
446
- {
447
- "epoch": 1.2449116705528989,
448
- "grad_norm": 0.23558488488197327,
449
- "learning_rate": 1.889104058807256e-05,
450
- "loss": 0.366,
451
- "step": 31500
452
- },
453
- {
454
- "epoch": 1.2646721732600876,
455
- "grad_norm": 0.2056606411933899,
456
- "learning_rate": 1.839702802039284e-05,
457
- "loss": 0.363,
458
- "step": 32000
459
- },
460
- {
461
- "epoch": 1.2844326759672766,
462
- "grad_norm": 0.2141130417585373,
463
- "learning_rate": 1.790400347784848e-05,
464
- "loss": 0.3671,
465
- "step": 32500
466
- },
467
- {
468
- "epoch": 1.3041931786744656,
469
- "grad_norm": 0.20828750729560852,
470
- "learning_rate": 1.7409990910168754e-05,
471
- "loss": 0.3635,
472
- "step": 33000
473
- },
474
- {
475
- "epoch": 1.3239536813816544,
476
- "grad_norm": 0.18149369955062866,
477
- "learning_rate": 1.6915978342489033e-05,
478
- "loss": 0.3637,
479
- "step": 33500
480
- },
481
- {
482
- "epoch": 1.3437141840888431,
483
- "grad_norm": 0.21469901502132416,
484
- "learning_rate": 1.642196577480931e-05,
485
- "loss": 0.3586,
486
- "step": 34000
487
- },
488
- {
489
- "epoch": 1.3634746867960321,
490
- "grad_norm": 0.22413115203380585,
491
- "learning_rate": 1.592894123226495e-05,
492
- "loss": 0.3596,
493
- "step": 34500
494
- },
495
- {
496
- "epoch": 1.3832351895032209,
497
- "grad_norm": 0.2113995999097824,
498
- "learning_rate": 1.543492866458523e-05,
499
- "loss": 0.3547,
500
- "step": 35000
501
- },
502
- {
503
- "epoch": 1.4029956922104099,
504
- "grad_norm": 0.19711661338806152,
505
- "learning_rate": 1.4940916096905506e-05,
506
- "loss": 0.3601,
507
- "step": 35500
508
- },
509
- {
510
- "epoch": 1.4227561949175986,
511
- "grad_norm": 0.24169065058231354,
512
- "learning_rate": 1.4447891554361142e-05,
513
- "loss": 0.3582,
514
- "step": 36000
515
- },
516
- {
517
- "epoch": 1.4425166976247876,
518
- "grad_norm": 0.2239445596933365,
519
- "learning_rate": 1.3953878986681421e-05,
520
- "loss": 0.358,
521
- "step": 36500
522
- },
523
- {
524
- "epoch": 1.4622772003319764,
525
- "grad_norm": 0.25448787212371826,
526
- "learning_rate": 1.3459866419001698e-05,
527
- "loss": 0.3573,
528
- "step": 37000
529
- },
530
- {
531
- "epoch": 1.4820377030391654,
532
- "grad_norm": 0.20836737751960754,
533
- "learning_rate": 1.296585385132198e-05,
534
- "loss": 0.3533,
535
- "step": 37500
536
- },
537
- {
538
- "epoch": 1.5017982057463541,
539
- "grad_norm": 0.18163880705833435,
540
- "learning_rate": 1.2471841283642256e-05,
541
- "loss": 0.3564,
542
- "step": 38000
543
- },
544
- {
545
- "epoch": 1.5215587084535431,
546
- "grad_norm": 0.23112539947032928,
547
- "learning_rate": 1.1978816741097894e-05,
548
- "loss": 0.3516,
549
- "step": 38500
550
- },
551
- {
552
- "epoch": 1.541319211160732,
553
- "grad_norm": 0.18698620796203613,
554
- "learning_rate": 1.1484804173418171e-05,
555
- "loss": 0.3534,
556
- "step": 39000
557
- },
558
- {
559
- "epoch": 1.5610797138679207,
560
- "grad_norm": 0.2288573682308197,
561
- "learning_rate": 1.099079160573845e-05,
562
- "loss": 0.3527,
563
- "step": 39500
564
- },
565
- {
566
- "epoch": 1.5808402165751096,
567
- "grad_norm": 0.22100871801376343,
568
- "learning_rate": 1.0496779038058729e-05,
569
- "loss": 0.3504,
570
- "step": 40000
571
- },
572
- {
573
- "epoch": 1.6006007192822986,
574
- "grad_norm": 0.24465763568878174,
575
- "learning_rate": 1.0002766470379007e-05,
576
- "loss": 0.3526,
577
- "step": 40500
578
- },
579
- {
580
- "epoch": 1.6203612219894874,
581
- "grad_norm": 0.22944742441177368,
582
- "learning_rate": 9.508753902699286e-06,
583
- "loss": 0.3544,
584
- "step": 41000
585
- },
586
- {
587
- "epoch": 1.6401217246966762,
588
- "grad_norm": 0.22519248723983765,
589
- "learning_rate": 9.015729360154923e-06,
590
- "loss": 0.3507,
591
- "step": 41500
592
- },
593
- {
594
- "epoch": 1.6598822274038652,
595
- "grad_norm": 0.21818549931049347,
596
- "learning_rate": 8.521716792475202e-06,
597
- "loss": 0.3447,
598
- "step": 42000
599
- },
600
- {
601
- "epoch": 1.6796427301110541,
602
- "grad_norm": 0.23688144981861115,
603
- "learning_rate": 8.027704224795479e-06,
604
- "loss": 0.3485,
605
- "step": 42500
606
- },
607
- {
608
- "epoch": 1.699403232818243,
609
- "grad_norm": 0.2965475916862488,
610
- "learning_rate": 7.533691657115757e-06,
611
- "loss": 0.3482,
612
- "step": 43000
613
- },
614
- {
615
- "epoch": 1.7191637355254317,
616
- "grad_norm": 0.21588334441184998,
617
- "learning_rate": 7.039679089436035e-06,
618
- "loss": 0.3487,
619
- "step": 43500
620
- },
621
- {
622
- "epoch": 1.7389242382326207,
623
- "grad_norm": 0.25196942687034607,
624
- "learning_rate": 6.545666521756313e-06,
625
- "loss": 0.347,
626
- "step": 44000
627
- },
628
- {
629
- "epoch": 1.7586847409398096,
630
- "grad_norm": 0.2386660873889923,
631
- "learning_rate": 6.051653954076593e-06,
632
- "loss": 0.3493,
633
- "step": 44500
634
- },
635
- {
636
- "epoch": 1.7784452436469984,
637
- "grad_norm": 0.21771515905857086,
638
- "learning_rate": 5.5586294115322294e-06,
639
- "loss": 0.3412,
640
- "step": 45000
641
- },
642
- {
643
- "epoch": 1.7982057463541872,
644
- "grad_norm": 0.20925763249397278,
645
- "learning_rate": 5.064616843852508e-06,
646
- "loss": 0.3451,
647
- "step": 45500
648
- },
649
- {
650
- "epoch": 1.8179662490613762,
651
- "grad_norm": 0.21637707948684692,
652
- "learning_rate": 4.570604276172786e-06,
653
- "loss": 0.3483,
654
- "step": 46000
655
- },
656
- {
657
- "epoch": 1.8377267517685651,
658
- "grad_norm": 0.21756745874881744,
659
- "learning_rate": 4.076591708493064e-06,
660
- "loss": 0.3518,
661
- "step": 46500
662
- },
663
- {
664
- "epoch": 1.857487254475754,
665
- "grad_norm": 0.21370700001716614,
666
- "learning_rate": 3.583567165948702e-06,
667
- "loss": 0.3489,
668
- "step": 47000
669
- },
670
- {
671
- "epoch": 1.8772477571829427,
672
- "grad_norm": 0.27830684185028076,
673
- "learning_rate": 3.0895545982689803e-06,
674
- "loss": 0.3451,
675
- "step": 47500
676
- },
677
- {
678
- "epoch": 1.8970082598901317,
679
- "grad_norm": 0.225153848528862,
680
- "learning_rate": 2.595542030589258e-06,
681
- "loss": 0.3451,
682
- "step": 48000
683
- },
684
- {
685
- "epoch": 1.9167687625973204,
686
- "grad_norm": 0.2039685696363449,
687
- "learning_rate": 2.1015294629095364e-06,
688
- "loss": 0.349,
689
- "step": 48500
690
- },
691
- {
692
- "epoch": 1.9365292653045092,
693
- "grad_norm": 0.3599139153957367,
694
- "learning_rate": 1.607516895229815e-06,
695
- "loss": 0.3486,
696
- "step": 49000
697
- },
698
- {
699
- "epoch": 1.9562897680116982,
700
- "grad_norm": 0.2884109616279602,
701
- "learning_rate": 1.113504327550093e-06,
702
- "loss": 0.343,
703
- "step": 49500
704
- },
705
- {
706
- "epoch": 1.9760502707188872,
707
- "grad_norm": 0.2130923867225647,
708
- "learning_rate": 6.194917598703712e-07,
709
- "loss": 0.347,
710
- "step": 50000
711
- }
712
- ],
713
- "logging_steps": 500,
714
- "max_steps": 50606,
715
- "num_input_tokens_seen": 0,
716
- "num_train_epochs": 2,
717
- "save_steps": 500,
718
- "stateful_callbacks": {
719
- "TrainerControl": {
720
- "args": {
721
- "should_epoch_stop": false,
722
- "should_evaluate": false,
723
- "should_log": false,
724
- "should_save": true,
725
- "should_training_stop": false
726
- },
727
- "attributes": {}
728
- }
729
- },
730
- "total_flos": 1.0827337350669926e+17,
731
- "train_batch_size": 32,
732
- "trial_name": null,
733
- "trial_params": null
734
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-50500/trainer_state.json DELETED
@@ -1,741 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.995810773426076,
6
- "eval_steps": 500,
7
- "global_step": 50500,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.01976050270718887,
14
- "grad_norm": 0.40364792943000793,
15
- "learning_rate": 4.951092755799708e-05,
16
- "loss": 2.1373,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.03952100541437774,
21
- "grad_norm": 0.5190012454986572,
22
- "learning_rate": 4.901691499031736e-05,
23
- "loss": 1.1731,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.059281508121566615,
28
- "grad_norm": 0.44221577048301697,
29
- "learning_rate": 4.8522902422637634e-05,
30
- "loss": 0.9673,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.07904201082875548,
35
- "grad_norm": 0.3572881817817688,
36
- "learning_rate": 4.8028889854957916e-05,
37
- "loss": 0.8537,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.09880251353594435,
42
- "grad_norm": 0.36681538820266724,
43
- "learning_rate": 4.753487728727819e-05,
44
- "loss": 0.77,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.11856301624313323,
49
- "grad_norm": 0.3278830647468567,
50
- "learning_rate": 4.7040864719598474e-05,
51
- "loss": 0.7379,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.1383235189503221,
56
- "grad_norm": 0.32520031929016113,
57
- "learning_rate": 4.654685215191875e-05,
58
- "loss": 0.6918,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 0.15808402165751095,
63
- "grad_norm": 0.3115929961204529,
64
- "learning_rate": 4.6052839584239024e-05,
65
- "loss": 0.6635,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 0.17784452436469983,
70
- "grad_norm": 0.309794157743454,
71
- "learning_rate": 4.55588270165593e-05,
72
- "loss": 0.6363,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 0.1976050270718887,
77
- "grad_norm": 0.3163657784461975,
78
- "learning_rate": 4.506481444887958e-05,
79
- "loss": 0.6113,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 0.21736552977907758,
84
- "grad_norm": 0.27902352809906006,
85
- "learning_rate": 4.457080188119986e-05,
86
- "loss": 0.5899,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 0.23712603248626646,
91
- "grad_norm": 0.26167234778404236,
92
- "learning_rate": 4.407678931352014e-05,
93
- "loss": 0.58,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 0.25688653519345533,
98
- "grad_norm": 0.30986905097961426,
99
- "learning_rate": 4.3582776745840415e-05,
100
- "loss": 0.5632,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 0.2766470379006442,
105
- "grad_norm": 0.28008660674095154,
106
- "learning_rate": 4.30887641781607e-05,
107
- "loss": 0.5478,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 0.2964075406078331,
112
- "grad_norm": 0.2860545516014099,
113
- "learning_rate": 4.259475161048097e-05,
114
- "loss": 0.541,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 0.3161680433150219,
119
- "grad_norm": 0.23841875791549683,
120
- "learning_rate": 4.2100739042801254e-05,
121
- "loss": 0.5261,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 0.3359285460222108,
126
- "grad_norm": 0.24955500662326813,
127
- "learning_rate": 4.160672647512153e-05,
128
- "loss": 0.5184,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 0.35568904872939966,
133
- "grad_norm": 0.28650936484336853,
134
- "learning_rate": 4.1112713907441805e-05,
135
- "loss": 0.5092,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 0.37544955143658854,
140
- "grad_norm": 0.26088303327560425,
141
- "learning_rate": 4.061870133976208e-05,
142
- "loss": 0.4939,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 0.3952100541437774,
147
- "grad_norm": 0.27843499183654785,
148
- "learning_rate": 4.012468877208236e-05,
149
- "loss": 0.493,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 0.4149705568509663,
154
- "grad_norm": 0.45379650592803955,
155
- "learning_rate": 3.963067620440264e-05,
156
- "loss": 0.4824,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 0.43473105955815516,
161
- "grad_norm": 0.29186248779296875,
162
- "learning_rate": 3.913666363672292e-05,
163
- "loss": 0.4743,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 0.45449156226534404,
168
- "grad_norm": 0.2711990475654602,
169
- "learning_rate": 3.86426510690432e-05,
170
- "loss": 0.4684,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 0.4742520649725329,
175
- "grad_norm": 0.2682012915611267,
176
- "learning_rate": 3.814863850136348e-05,
177
- "loss": 0.4674,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 0.4940125676797218,
182
- "grad_norm": 0.3144526183605194,
183
- "learning_rate": 3.765462593368376e-05,
184
- "loss": 0.4565,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 0.5137730703869107,
189
- "grad_norm": 0.24828797578811646,
190
- "learning_rate": 3.7160613366004034e-05,
191
- "loss": 0.4601,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 0.5335335730940995,
196
- "grad_norm": 0.23556892573833466,
197
- "learning_rate": 3.6666600798324317e-05,
198
- "loss": 0.4456,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 0.5532940758012884,
203
- "grad_norm": 0.20168079435825348,
204
- "learning_rate": 3.617258823064459e-05,
205
- "loss": 0.4416,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 0.5730545785084773,
210
- "grad_norm": 0.26364487409591675,
211
- "learning_rate": 3.567857566296487e-05,
212
- "loss": 0.4394,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 0.5928150812156662,
217
- "grad_norm": 0.22534222900867462,
218
- "learning_rate": 3.518456309528514e-05,
219
- "loss": 0.4371,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 0.612575583922855,
224
- "grad_norm": 0.2668949067592621,
225
- "learning_rate": 3.4690550527605425e-05,
226
- "loss": 0.4292,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 0.6323360866300438,
231
- "grad_norm": 0.26472756266593933,
232
- "learning_rate": 3.41965379599257e-05,
233
- "loss": 0.4271,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 0.6520965893372327,
238
- "grad_norm": 0.238133504986763,
239
- "learning_rate": 3.3703513417381335e-05,
240
- "loss": 0.4214,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 0.6718570920444216,
245
- "grad_norm": 0.19885075092315674,
246
- "learning_rate": 3.320950084970162e-05,
247
- "loss": 0.4178,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 0.6916175947516104,
252
- "grad_norm": 0.2647489309310913,
253
- "learning_rate": 3.271548828202189e-05,
254
- "loss": 0.4171,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 0.7113780974587993,
259
- "grad_norm": 0.2713819444179535,
260
- "learning_rate": 3.2221475714342175e-05,
261
- "loss": 0.4152,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 0.7311386001659882,
266
- "grad_norm": 0.21879780292510986,
267
- "learning_rate": 3.172845117179781e-05,
268
- "loss": 0.4124,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 0.7508991028731771,
273
- "grad_norm": 0.21214431524276733,
274
- "learning_rate": 3.1234438604118085e-05,
275
- "loss": 0.4046,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 0.770659605580366,
280
- "grad_norm": 0.23132909834384918,
281
- "learning_rate": 3.074042603643837e-05,
282
- "loss": 0.4074,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 0.7904201082875548,
287
- "grad_norm": 0.25402024388313293,
288
- "learning_rate": 3.0246413468758646e-05,
289
- "loss": 0.4025,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 0.8101806109947437,
294
- "grad_norm": 0.24344465136528015,
295
- "learning_rate": 2.9754376951349644e-05,
296
- "loss": 0.3954,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 0.8299411137019326,
301
- "grad_norm": 0.35887858271598816,
302
- "learning_rate": 2.926036438366992e-05,
303
- "loss": 0.3999,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 0.8497016164091215,
308
- "grad_norm": 0.2312784641981125,
309
- "learning_rate": 2.87663518159902e-05,
310
- "loss": 0.3953,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 0.8694621191163103,
315
- "grad_norm": 0.25101277232170105,
316
- "learning_rate": 2.8273327273445837e-05,
317
- "loss": 0.394,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 0.8892226218234992,
322
- "grad_norm": 0.2582937479019165,
323
- "learning_rate": 2.777931470576612e-05,
324
- "loss": 0.3936,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 0.9089831245306881,
329
- "grad_norm": 0.20885373651981354,
330
- "learning_rate": 2.7285302138086394e-05,
331
- "loss": 0.384,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 0.928743627237877,
336
- "grad_norm": 0.23662053048610687,
337
- "learning_rate": 2.6791289570406676e-05,
338
- "loss": 0.3818,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 0.9485041299450658,
343
- "grad_norm": 0.23784968256950378,
344
- "learning_rate": 2.629727700272695e-05,
345
- "loss": 0.3884,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 0.9682646326522547,
350
- "grad_norm": 0.24405881762504578,
351
- "learning_rate": 2.5804252460182587e-05,
352
- "loss": 0.384,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 0.9880251353594436,
357
- "grad_norm": 0.18989978730678558,
358
- "learning_rate": 2.531023989250287e-05,
359
- "loss": 0.3821,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 1.0077856380666323,
364
- "grad_norm": 0.22834104299545288,
365
- "learning_rate": 2.4816227324823144e-05,
366
- "loss": 0.3801,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 1.0275461407738213,
371
- "grad_norm": 0.30803126096725464,
372
- "learning_rate": 2.4322214757143423e-05,
373
- "loss": 0.3739,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 1.04730664348101,
378
- "grad_norm": 0.21231615543365479,
379
- "learning_rate": 2.3828202189463698e-05,
380
- "loss": 0.3791,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 1.067067146188199,
385
- "grad_norm": 0.26202794909477234,
386
- "learning_rate": 2.333418962178398e-05,
387
- "loss": 0.3739,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 1.0868276488953879,
392
- "grad_norm": 0.21842999756336212,
393
- "learning_rate": 2.284017705410426e-05,
394
- "loss": 0.377,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 1.1065881516025768,
399
- "grad_norm": 0.22890307009220123,
400
- "learning_rate": 2.2346164486424538e-05,
401
- "loss": 0.3733,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 1.1263486543097656,
406
- "grad_norm": 0.20813824236392975,
407
- "learning_rate": 2.1852151918744816e-05,
408
- "loss": 0.37,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 1.1461091570169546,
413
- "grad_norm": 0.2490423172712326,
414
- "learning_rate": 2.1358139351065092e-05,
415
- "loss": 0.3704,
416
- "step": 29000
417
- },
418
- {
419
- "epoch": 1.1658696597241434,
420
- "grad_norm": 0.20076008141040802,
421
- "learning_rate": 2.086511480852073e-05,
422
- "loss": 0.3733,
423
- "step": 29500
424
- },
425
- {
426
- "epoch": 1.1856301624313323,
427
- "grad_norm": 0.20791497826576233,
428
- "learning_rate": 2.037110224084101e-05,
429
- "loss": 0.3682,
430
- "step": 30000
431
- },
432
- {
433
- "epoch": 1.2053906651385211,
434
- "grad_norm": 0.3393056094646454,
435
- "learning_rate": 1.9878077698296648e-05,
436
- "loss": 0.3687,
437
- "step": 30500
438
- },
439
- {
440
- "epoch": 1.22515116784571,
441
- "grad_norm": 0.21395719051361084,
442
- "learning_rate": 1.9384065130616923e-05,
443
- "loss": 0.3649,
444
- "step": 31000
445
- },
446
- {
447
- "epoch": 1.2449116705528989,
448
- "grad_norm": 0.23558488488197327,
449
- "learning_rate": 1.889104058807256e-05,
450
- "loss": 0.366,
451
- "step": 31500
452
- },
453
- {
454
- "epoch": 1.2646721732600876,
455
- "grad_norm": 0.2056606411933899,
456
- "learning_rate": 1.839702802039284e-05,
457
- "loss": 0.363,
458
- "step": 32000
459
- },
460
- {
461
- "epoch": 1.2844326759672766,
462
- "grad_norm": 0.2141130417585373,
463
- "learning_rate": 1.790400347784848e-05,
464
- "loss": 0.3671,
465
- "step": 32500
466
- },
467
- {
468
- "epoch": 1.3041931786744656,
469
- "grad_norm": 0.20828750729560852,
470
- "learning_rate": 1.7409990910168754e-05,
471
- "loss": 0.3635,
472
- "step": 33000
473
- },
474
- {
475
- "epoch": 1.3239536813816544,
476
- "grad_norm": 0.18149369955062866,
477
- "learning_rate": 1.6915978342489033e-05,
478
- "loss": 0.3637,
479
- "step": 33500
480
- },
481
- {
482
- "epoch": 1.3437141840888431,
483
- "grad_norm": 0.21469901502132416,
484
- "learning_rate": 1.642196577480931e-05,
485
- "loss": 0.3586,
486
- "step": 34000
487
- },
488
- {
489
- "epoch": 1.3634746867960321,
490
- "grad_norm": 0.22413115203380585,
491
- "learning_rate": 1.592894123226495e-05,
492
- "loss": 0.3596,
493
- "step": 34500
494
- },
495
- {
496
- "epoch": 1.3832351895032209,
497
- "grad_norm": 0.2113995999097824,
498
- "learning_rate": 1.543492866458523e-05,
499
- "loss": 0.3547,
500
- "step": 35000
501
- },
502
- {
503
- "epoch": 1.4029956922104099,
504
- "grad_norm": 0.19711661338806152,
505
- "learning_rate": 1.4940916096905506e-05,
506
- "loss": 0.3601,
507
- "step": 35500
508
- },
509
- {
510
- "epoch": 1.4227561949175986,
511
- "grad_norm": 0.24169065058231354,
512
- "learning_rate": 1.4447891554361142e-05,
513
- "loss": 0.3582,
514
- "step": 36000
515
- },
516
- {
517
- "epoch": 1.4425166976247876,
518
- "grad_norm": 0.2239445596933365,
519
- "learning_rate": 1.3953878986681421e-05,
520
- "loss": 0.358,
521
- "step": 36500
522
- },
523
- {
524
- "epoch": 1.4622772003319764,
525
- "grad_norm": 0.25448787212371826,
526
- "learning_rate": 1.3459866419001698e-05,
527
- "loss": 0.3573,
528
- "step": 37000
529
- },
530
- {
531
- "epoch": 1.4820377030391654,
532
- "grad_norm": 0.20836737751960754,
533
- "learning_rate": 1.296585385132198e-05,
534
- "loss": 0.3533,
535
- "step": 37500
536
- },
537
- {
538
- "epoch": 1.5017982057463541,
539
- "grad_norm": 0.18163880705833435,
540
- "learning_rate": 1.2471841283642256e-05,
541
- "loss": 0.3564,
542
- "step": 38000
543
- },
544
- {
545
- "epoch": 1.5215587084535431,
546
- "grad_norm": 0.23112539947032928,
547
- "learning_rate": 1.1978816741097894e-05,
548
- "loss": 0.3516,
549
- "step": 38500
550
- },
551
- {
552
- "epoch": 1.541319211160732,
553
- "grad_norm": 0.18698620796203613,
554
- "learning_rate": 1.1484804173418171e-05,
555
- "loss": 0.3534,
556
- "step": 39000
557
- },
558
- {
559
- "epoch": 1.5610797138679207,
560
- "grad_norm": 0.2288573682308197,
561
- "learning_rate": 1.099079160573845e-05,
562
- "loss": 0.3527,
563
- "step": 39500
564
- },
565
- {
566
- "epoch": 1.5808402165751096,
567
- "grad_norm": 0.22100871801376343,
568
- "learning_rate": 1.0496779038058729e-05,
569
- "loss": 0.3504,
570
- "step": 40000
571
- },
572
- {
573
- "epoch": 1.6006007192822986,
574
- "grad_norm": 0.24465763568878174,
575
- "learning_rate": 1.0002766470379007e-05,
576
- "loss": 0.3526,
577
- "step": 40500
578
- },
579
- {
580
- "epoch": 1.6203612219894874,
581
- "grad_norm": 0.22944742441177368,
582
- "learning_rate": 9.508753902699286e-06,
583
- "loss": 0.3544,
584
- "step": 41000
585
- },
586
- {
587
- "epoch": 1.6401217246966762,
588
- "grad_norm": 0.22519248723983765,
589
- "learning_rate": 9.015729360154923e-06,
590
- "loss": 0.3507,
591
- "step": 41500
592
- },
593
- {
594
- "epoch": 1.6598822274038652,
595
- "grad_norm": 0.21818549931049347,
596
- "learning_rate": 8.521716792475202e-06,
597
- "loss": 0.3447,
598
- "step": 42000
599
- },
600
- {
601
- "epoch": 1.6796427301110541,
602
- "grad_norm": 0.23688144981861115,
603
- "learning_rate": 8.027704224795479e-06,
604
- "loss": 0.3485,
605
- "step": 42500
606
- },
607
- {
608
- "epoch": 1.699403232818243,
609
- "grad_norm": 0.2965475916862488,
610
- "learning_rate": 7.533691657115757e-06,
611
- "loss": 0.3482,
612
- "step": 43000
613
- },
614
- {
615
- "epoch": 1.7191637355254317,
616
- "grad_norm": 0.21588334441184998,
617
- "learning_rate": 7.039679089436035e-06,
618
- "loss": 0.3487,
619
- "step": 43500
620
- },
621
- {
622
- "epoch": 1.7389242382326207,
623
- "grad_norm": 0.25196942687034607,
624
- "learning_rate": 6.545666521756313e-06,
625
- "loss": 0.347,
626
- "step": 44000
627
- },
628
- {
629
- "epoch": 1.7586847409398096,
630
- "grad_norm": 0.2386660873889923,
631
- "learning_rate": 6.051653954076593e-06,
632
- "loss": 0.3493,
633
- "step": 44500
634
- },
635
- {
636
- "epoch": 1.7784452436469984,
637
- "grad_norm": 0.21771515905857086,
638
- "learning_rate": 5.5586294115322294e-06,
639
- "loss": 0.3412,
640
- "step": 45000
641
- },
642
- {
643
- "epoch": 1.7982057463541872,
644
- "grad_norm": 0.20925763249397278,
645
- "learning_rate": 5.064616843852508e-06,
646
- "loss": 0.3451,
647
- "step": 45500
648
- },
649
- {
650
- "epoch": 1.8179662490613762,
651
- "grad_norm": 0.21637707948684692,
652
- "learning_rate": 4.570604276172786e-06,
653
- "loss": 0.3483,
654
- "step": 46000
655
- },
656
- {
657
- "epoch": 1.8377267517685651,
658
- "grad_norm": 0.21756745874881744,
659
- "learning_rate": 4.076591708493064e-06,
660
- "loss": 0.3518,
661
- "step": 46500
662
- },
663
- {
664
- "epoch": 1.857487254475754,
665
- "grad_norm": 0.21370700001716614,
666
- "learning_rate": 3.583567165948702e-06,
667
- "loss": 0.3489,
668
- "step": 47000
669
- },
670
- {
671
- "epoch": 1.8772477571829427,
672
- "grad_norm": 0.27830684185028076,
673
- "learning_rate": 3.0895545982689803e-06,
674
- "loss": 0.3451,
675
- "step": 47500
676
- },
677
- {
678
- "epoch": 1.8970082598901317,
679
- "grad_norm": 0.225153848528862,
680
- "learning_rate": 2.595542030589258e-06,
681
- "loss": 0.3451,
682
- "step": 48000
683
- },
684
- {
685
- "epoch": 1.9167687625973204,
686
- "grad_norm": 0.2039685696363449,
687
- "learning_rate": 2.1015294629095364e-06,
688
- "loss": 0.349,
689
- "step": 48500
690
- },
691
- {
692
- "epoch": 1.9365292653045092,
693
- "grad_norm": 0.3599139153957367,
694
- "learning_rate": 1.607516895229815e-06,
695
- "loss": 0.3486,
696
- "step": 49000
697
- },
698
- {
699
- "epoch": 1.9562897680116982,
700
- "grad_norm": 0.2884109616279602,
701
- "learning_rate": 1.113504327550093e-06,
702
- "loss": 0.343,
703
- "step": 49500
704
- },
705
- {
706
- "epoch": 1.9760502707188872,
707
- "grad_norm": 0.2130923867225647,
708
- "learning_rate": 6.194917598703712e-07,
709
- "loss": 0.347,
710
- "step": 50000
711
- },
712
- {
713
- "epoch": 1.995810773426076,
714
- "grad_norm": 0.21081362664699554,
715
- "learning_rate": 1.2547919219064935e-07,
716
- "loss": 0.3495,
717
- "step": 50500
718
- }
719
- ],
720
- "logging_steps": 500,
721
- "max_steps": 50606,
722
- "num_input_tokens_seen": 0,
723
- "num_train_epochs": 2,
724
- "save_steps": 500,
725
- "stateful_callbacks": {
726
- "TrainerControl": {
727
- "args": {
728
- "should_epoch_stop": false,
729
- "should_evaluate": false,
730
- "should_log": false,
731
- "should_save": true,
732
- "should_training_stop": false
733
- },
734
- "attributes": {}
735
- }
736
- },
737
- "total_flos": 1.0935610791847526e+17,
738
- "train_batch_size": 32,
739
- "trial_name": null,
740
- "trial_params": null
741
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-50606/trainer_state.json DELETED
@@ -1,741 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
- "eval_steps": 500,
7
- "global_step": 50606,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.01976050270718887,
14
- "grad_norm": 0.40364792943000793,
15
- "learning_rate": 4.951092755799708e-05,
16
- "loss": 2.1373,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.03952100541437774,
21
- "grad_norm": 0.5190012454986572,
22
- "learning_rate": 4.901691499031736e-05,
23
- "loss": 1.1731,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.059281508121566615,
28
- "grad_norm": 0.44221577048301697,
29
- "learning_rate": 4.8522902422637634e-05,
30
- "loss": 0.9673,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.07904201082875548,
35
- "grad_norm": 0.3572881817817688,
36
- "learning_rate": 4.8028889854957916e-05,
37
- "loss": 0.8537,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.09880251353594435,
42
- "grad_norm": 0.36681538820266724,
43
- "learning_rate": 4.753487728727819e-05,
44
- "loss": 0.77,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.11856301624313323,
49
- "grad_norm": 0.3278830647468567,
50
- "learning_rate": 4.7040864719598474e-05,
51
- "loss": 0.7379,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.1383235189503221,
56
- "grad_norm": 0.32520031929016113,
57
- "learning_rate": 4.654685215191875e-05,
58
- "loss": 0.6918,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 0.15808402165751095,
63
- "grad_norm": 0.3115929961204529,
64
- "learning_rate": 4.6052839584239024e-05,
65
- "loss": 0.6635,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 0.17784452436469983,
70
- "grad_norm": 0.309794157743454,
71
- "learning_rate": 4.55588270165593e-05,
72
- "loss": 0.6363,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 0.1976050270718887,
77
- "grad_norm": 0.3163657784461975,
78
- "learning_rate": 4.506481444887958e-05,
79
- "loss": 0.6113,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 0.21736552977907758,
84
- "grad_norm": 0.27902352809906006,
85
- "learning_rate": 4.457080188119986e-05,
86
- "loss": 0.5899,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 0.23712603248626646,
91
- "grad_norm": 0.26167234778404236,
92
- "learning_rate": 4.407678931352014e-05,
93
- "loss": 0.58,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 0.25688653519345533,
98
- "grad_norm": 0.30986905097961426,
99
- "learning_rate": 4.3582776745840415e-05,
100
- "loss": 0.5632,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 0.2766470379006442,
105
- "grad_norm": 0.28008660674095154,
106
- "learning_rate": 4.30887641781607e-05,
107
- "loss": 0.5478,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 0.2964075406078331,
112
- "grad_norm": 0.2860545516014099,
113
- "learning_rate": 4.259475161048097e-05,
114
- "loss": 0.541,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 0.3161680433150219,
119
- "grad_norm": 0.23841875791549683,
120
- "learning_rate": 4.2100739042801254e-05,
121
- "loss": 0.5261,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 0.3359285460222108,
126
- "grad_norm": 0.24955500662326813,
127
- "learning_rate": 4.160672647512153e-05,
128
- "loss": 0.5184,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 0.35568904872939966,
133
- "grad_norm": 0.28650936484336853,
134
- "learning_rate": 4.1112713907441805e-05,
135
- "loss": 0.5092,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 0.37544955143658854,
140
- "grad_norm": 0.26088303327560425,
141
- "learning_rate": 4.061870133976208e-05,
142
- "loss": 0.4939,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 0.3952100541437774,
147
- "grad_norm": 0.27843499183654785,
148
- "learning_rate": 4.012468877208236e-05,
149
- "loss": 0.493,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 0.4149705568509663,
154
- "grad_norm": 0.45379650592803955,
155
- "learning_rate": 3.963067620440264e-05,
156
- "loss": 0.4824,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 0.43473105955815516,
161
- "grad_norm": 0.29186248779296875,
162
- "learning_rate": 3.913666363672292e-05,
163
- "loss": 0.4743,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 0.45449156226534404,
168
- "grad_norm": 0.2711990475654602,
169
- "learning_rate": 3.86426510690432e-05,
170
- "loss": 0.4684,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 0.4742520649725329,
175
- "grad_norm": 0.2682012915611267,
176
- "learning_rate": 3.814863850136348e-05,
177
- "loss": 0.4674,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 0.4940125676797218,
182
- "grad_norm": 0.3144526183605194,
183
- "learning_rate": 3.765462593368376e-05,
184
- "loss": 0.4565,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 0.5137730703869107,
189
- "grad_norm": 0.24828797578811646,
190
- "learning_rate": 3.7160613366004034e-05,
191
- "loss": 0.4601,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 0.5335335730940995,
196
- "grad_norm": 0.23556892573833466,
197
- "learning_rate": 3.6666600798324317e-05,
198
- "loss": 0.4456,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 0.5532940758012884,
203
- "grad_norm": 0.20168079435825348,
204
- "learning_rate": 3.617258823064459e-05,
205
- "loss": 0.4416,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 0.5730545785084773,
210
- "grad_norm": 0.26364487409591675,
211
- "learning_rate": 3.567857566296487e-05,
212
- "loss": 0.4394,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 0.5928150812156662,
217
- "grad_norm": 0.22534222900867462,
218
- "learning_rate": 3.518456309528514e-05,
219
- "loss": 0.4371,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 0.612575583922855,
224
- "grad_norm": 0.2668949067592621,
225
- "learning_rate": 3.4690550527605425e-05,
226
- "loss": 0.4292,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 0.6323360866300438,
231
- "grad_norm": 0.26472756266593933,
232
- "learning_rate": 3.41965379599257e-05,
233
- "loss": 0.4271,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 0.6520965893372327,
238
- "grad_norm": 0.238133504986763,
239
- "learning_rate": 3.3703513417381335e-05,
240
- "loss": 0.4214,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 0.6718570920444216,
245
- "grad_norm": 0.19885075092315674,
246
- "learning_rate": 3.320950084970162e-05,
247
- "loss": 0.4178,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 0.6916175947516104,
252
- "grad_norm": 0.2647489309310913,
253
- "learning_rate": 3.271548828202189e-05,
254
- "loss": 0.4171,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 0.7113780974587993,
259
- "grad_norm": 0.2713819444179535,
260
- "learning_rate": 3.2221475714342175e-05,
261
- "loss": 0.4152,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 0.7311386001659882,
266
- "grad_norm": 0.21879780292510986,
267
- "learning_rate": 3.172845117179781e-05,
268
- "loss": 0.4124,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 0.7508991028731771,
273
- "grad_norm": 0.21214431524276733,
274
- "learning_rate": 3.1234438604118085e-05,
275
- "loss": 0.4046,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 0.770659605580366,
280
- "grad_norm": 0.23132909834384918,
281
- "learning_rate": 3.074042603643837e-05,
282
- "loss": 0.4074,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 0.7904201082875548,
287
- "grad_norm": 0.25402024388313293,
288
- "learning_rate": 3.0246413468758646e-05,
289
- "loss": 0.4025,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 0.8101806109947437,
294
- "grad_norm": 0.24344465136528015,
295
- "learning_rate": 2.9754376951349644e-05,
296
- "loss": 0.3954,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 0.8299411137019326,
301
- "grad_norm": 0.35887858271598816,
302
- "learning_rate": 2.926036438366992e-05,
303
- "loss": 0.3999,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 0.8497016164091215,
308
- "grad_norm": 0.2312784641981125,
309
- "learning_rate": 2.87663518159902e-05,
310
- "loss": 0.3953,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 0.8694621191163103,
315
- "grad_norm": 0.25101277232170105,
316
- "learning_rate": 2.8273327273445837e-05,
317
- "loss": 0.394,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 0.8892226218234992,
322
- "grad_norm": 0.2582937479019165,
323
- "learning_rate": 2.777931470576612e-05,
324
- "loss": 0.3936,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 0.9089831245306881,
329
- "grad_norm": 0.20885373651981354,
330
- "learning_rate": 2.7285302138086394e-05,
331
- "loss": 0.384,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 0.928743627237877,
336
- "grad_norm": 0.23662053048610687,
337
- "learning_rate": 2.6791289570406676e-05,
338
- "loss": 0.3818,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 0.9485041299450658,
343
- "grad_norm": 0.23784968256950378,
344
- "learning_rate": 2.629727700272695e-05,
345
- "loss": 0.3884,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 0.9682646326522547,
350
- "grad_norm": 0.24405881762504578,
351
- "learning_rate": 2.5804252460182587e-05,
352
- "loss": 0.384,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 0.9880251353594436,
357
- "grad_norm": 0.18989978730678558,
358
- "learning_rate": 2.531023989250287e-05,
359
- "loss": 0.3821,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 1.0077856380666323,
364
- "grad_norm": 0.22834104299545288,
365
- "learning_rate": 2.4816227324823144e-05,
366
- "loss": 0.3801,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 1.0275461407738213,
371
- "grad_norm": 0.30803126096725464,
372
- "learning_rate": 2.4322214757143423e-05,
373
- "loss": 0.3739,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 1.04730664348101,
378
- "grad_norm": 0.21231615543365479,
379
- "learning_rate": 2.3828202189463698e-05,
380
- "loss": 0.3791,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 1.067067146188199,
385
- "grad_norm": 0.26202794909477234,
386
- "learning_rate": 2.333418962178398e-05,
387
- "loss": 0.3739,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 1.0868276488953879,
392
- "grad_norm": 0.21842999756336212,
393
- "learning_rate": 2.284017705410426e-05,
394
- "loss": 0.377,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 1.1065881516025768,
399
- "grad_norm": 0.22890307009220123,
400
- "learning_rate": 2.2346164486424538e-05,
401
- "loss": 0.3733,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 1.1263486543097656,
406
- "grad_norm": 0.20813824236392975,
407
- "learning_rate": 2.1852151918744816e-05,
408
- "loss": 0.37,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 1.1461091570169546,
413
- "grad_norm": 0.2490423172712326,
414
- "learning_rate": 2.1358139351065092e-05,
415
- "loss": 0.3704,
416
- "step": 29000
417
- },
418
- {
419
- "epoch": 1.1658696597241434,
420
- "grad_norm": 0.20076008141040802,
421
- "learning_rate": 2.086511480852073e-05,
422
- "loss": 0.3733,
423
- "step": 29500
424
- },
425
- {
426
- "epoch": 1.1856301624313323,
427
- "grad_norm": 0.20791497826576233,
428
- "learning_rate": 2.037110224084101e-05,
429
- "loss": 0.3682,
430
- "step": 30000
431
- },
432
- {
433
- "epoch": 1.2053906651385211,
434
- "grad_norm": 0.3393056094646454,
435
- "learning_rate": 1.9878077698296648e-05,
436
- "loss": 0.3687,
437
- "step": 30500
438
- },
439
- {
440
- "epoch": 1.22515116784571,
441
- "grad_norm": 0.21395719051361084,
442
- "learning_rate": 1.9384065130616923e-05,
443
- "loss": 0.3649,
444
- "step": 31000
445
- },
446
- {
447
- "epoch": 1.2449116705528989,
448
- "grad_norm": 0.23558488488197327,
449
- "learning_rate": 1.889104058807256e-05,
450
- "loss": 0.366,
451
- "step": 31500
452
- },
453
- {
454
- "epoch": 1.2646721732600876,
455
- "grad_norm": 0.2056606411933899,
456
- "learning_rate": 1.839702802039284e-05,
457
- "loss": 0.363,
458
- "step": 32000
459
- },
460
- {
461
- "epoch": 1.2844326759672766,
462
- "grad_norm": 0.2141130417585373,
463
- "learning_rate": 1.790400347784848e-05,
464
- "loss": 0.3671,
465
- "step": 32500
466
- },
467
- {
468
- "epoch": 1.3041931786744656,
469
- "grad_norm": 0.20828750729560852,
470
- "learning_rate": 1.7409990910168754e-05,
471
- "loss": 0.3635,
472
- "step": 33000
473
- },
474
- {
475
- "epoch": 1.3239536813816544,
476
- "grad_norm": 0.18149369955062866,
477
- "learning_rate": 1.6915978342489033e-05,
478
- "loss": 0.3637,
479
- "step": 33500
480
- },
481
- {
482
- "epoch": 1.3437141840888431,
483
- "grad_norm": 0.21469901502132416,
484
- "learning_rate": 1.642196577480931e-05,
485
- "loss": 0.3586,
486
- "step": 34000
487
- },
488
- {
489
- "epoch": 1.3634746867960321,
490
- "grad_norm": 0.22413115203380585,
491
- "learning_rate": 1.592894123226495e-05,
492
- "loss": 0.3596,
493
- "step": 34500
494
- },
495
- {
496
- "epoch": 1.3832351895032209,
497
- "grad_norm": 0.2113995999097824,
498
- "learning_rate": 1.543492866458523e-05,
499
- "loss": 0.3547,
500
- "step": 35000
501
- },
502
- {
503
- "epoch": 1.4029956922104099,
504
- "grad_norm": 0.19711661338806152,
505
- "learning_rate": 1.4940916096905506e-05,
506
- "loss": 0.3601,
507
- "step": 35500
508
- },
509
- {
510
- "epoch": 1.4227561949175986,
511
- "grad_norm": 0.24169065058231354,
512
- "learning_rate": 1.4447891554361142e-05,
513
- "loss": 0.3582,
514
- "step": 36000
515
- },
516
- {
517
- "epoch": 1.4425166976247876,
518
- "grad_norm": 0.2239445596933365,
519
- "learning_rate": 1.3953878986681421e-05,
520
- "loss": 0.358,
521
- "step": 36500
522
- },
523
- {
524
- "epoch": 1.4622772003319764,
525
- "grad_norm": 0.25448787212371826,
526
- "learning_rate": 1.3459866419001698e-05,
527
- "loss": 0.3573,
528
- "step": 37000
529
- },
530
- {
531
- "epoch": 1.4820377030391654,
532
- "grad_norm": 0.20836737751960754,
533
- "learning_rate": 1.296585385132198e-05,
534
- "loss": 0.3533,
535
- "step": 37500
536
- },
537
- {
538
- "epoch": 1.5017982057463541,
539
- "grad_norm": 0.18163880705833435,
540
- "learning_rate": 1.2471841283642256e-05,
541
- "loss": 0.3564,
542
- "step": 38000
543
- },
544
- {
545
- "epoch": 1.5215587084535431,
546
- "grad_norm": 0.23112539947032928,
547
- "learning_rate": 1.1978816741097894e-05,
548
- "loss": 0.3516,
549
- "step": 38500
550
- },
551
- {
552
- "epoch": 1.541319211160732,
553
- "grad_norm": 0.18698620796203613,
554
- "learning_rate": 1.1484804173418171e-05,
555
- "loss": 0.3534,
556
- "step": 39000
557
- },
558
- {
559
- "epoch": 1.5610797138679207,
560
- "grad_norm": 0.2288573682308197,
561
- "learning_rate": 1.099079160573845e-05,
562
- "loss": 0.3527,
563
- "step": 39500
564
- },
565
- {
566
- "epoch": 1.5808402165751096,
567
- "grad_norm": 0.22100871801376343,
568
- "learning_rate": 1.0496779038058729e-05,
569
- "loss": 0.3504,
570
- "step": 40000
571
- },
572
- {
573
- "epoch": 1.6006007192822986,
574
- "grad_norm": 0.24465763568878174,
575
- "learning_rate": 1.0002766470379007e-05,
576
- "loss": 0.3526,
577
- "step": 40500
578
- },
579
- {
580
- "epoch": 1.6203612219894874,
581
- "grad_norm": 0.22944742441177368,
582
- "learning_rate": 9.508753902699286e-06,
583
- "loss": 0.3544,
584
- "step": 41000
585
- },
586
- {
587
- "epoch": 1.6401217246966762,
588
- "grad_norm": 0.22519248723983765,
589
- "learning_rate": 9.015729360154923e-06,
590
- "loss": 0.3507,
591
- "step": 41500
592
- },
593
- {
594
- "epoch": 1.6598822274038652,
595
- "grad_norm": 0.21818549931049347,
596
- "learning_rate": 8.521716792475202e-06,
597
- "loss": 0.3447,
598
- "step": 42000
599
- },
600
- {
601
- "epoch": 1.6796427301110541,
602
- "grad_norm": 0.23688144981861115,
603
- "learning_rate": 8.027704224795479e-06,
604
- "loss": 0.3485,
605
- "step": 42500
606
- },
607
- {
608
- "epoch": 1.699403232818243,
609
- "grad_norm": 0.2965475916862488,
610
- "learning_rate": 7.533691657115757e-06,
611
- "loss": 0.3482,
612
- "step": 43000
613
- },
614
- {
615
- "epoch": 1.7191637355254317,
616
- "grad_norm": 0.21588334441184998,
617
- "learning_rate": 7.039679089436035e-06,
618
- "loss": 0.3487,
619
- "step": 43500
620
- },
621
- {
622
- "epoch": 1.7389242382326207,
623
- "grad_norm": 0.25196942687034607,
624
- "learning_rate": 6.545666521756313e-06,
625
- "loss": 0.347,
626
- "step": 44000
627
- },
628
- {
629
- "epoch": 1.7586847409398096,
630
- "grad_norm": 0.2386660873889923,
631
- "learning_rate": 6.051653954076593e-06,
632
- "loss": 0.3493,
633
- "step": 44500
634
- },
635
- {
636
- "epoch": 1.7784452436469984,
637
- "grad_norm": 0.21771515905857086,
638
- "learning_rate": 5.5586294115322294e-06,
639
- "loss": 0.3412,
640
- "step": 45000
641
- },
642
- {
643
- "epoch": 1.7982057463541872,
644
- "grad_norm": 0.20925763249397278,
645
- "learning_rate": 5.064616843852508e-06,
646
- "loss": 0.3451,
647
- "step": 45500
648
- },
649
- {
650
- "epoch": 1.8179662490613762,
651
- "grad_norm": 0.21637707948684692,
652
- "learning_rate": 4.570604276172786e-06,
653
- "loss": 0.3483,
654
- "step": 46000
655
- },
656
- {
657
- "epoch": 1.8377267517685651,
658
- "grad_norm": 0.21756745874881744,
659
- "learning_rate": 4.076591708493064e-06,
660
- "loss": 0.3518,
661
- "step": 46500
662
- },
663
- {
664
- "epoch": 1.857487254475754,
665
- "grad_norm": 0.21370700001716614,
666
- "learning_rate": 3.583567165948702e-06,
667
- "loss": 0.3489,
668
- "step": 47000
669
- },
670
- {
671
- "epoch": 1.8772477571829427,
672
- "grad_norm": 0.27830684185028076,
673
- "learning_rate": 3.0895545982689803e-06,
674
- "loss": 0.3451,
675
- "step": 47500
676
- },
677
- {
678
- "epoch": 1.8970082598901317,
679
- "grad_norm": 0.225153848528862,
680
- "learning_rate": 2.595542030589258e-06,
681
- "loss": 0.3451,
682
- "step": 48000
683
- },
684
- {
685
- "epoch": 1.9167687625973204,
686
- "grad_norm": 0.2039685696363449,
687
- "learning_rate": 2.1015294629095364e-06,
688
- "loss": 0.349,
689
- "step": 48500
690
- },
691
- {
692
- "epoch": 1.9365292653045092,
693
- "grad_norm": 0.3599139153957367,
694
- "learning_rate": 1.607516895229815e-06,
695
- "loss": 0.3486,
696
- "step": 49000
697
- },
698
- {
699
- "epoch": 1.9562897680116982,
700
- "grad_norm": 0.2884109616279602,
701
- "learning_rate": 1.113504327550093e-06,
702
- "loss": 0.343,
703
- "step": 49500
704
- },
705
- {
706
- "epoch": 1.9760502707188872,
707
- "grad_norm": 0.2130923867225647,
708
- "learning_rate": 6.194917598703712e-07,
709
- "loss": 0.347,
710
- "step": 50000
711
- },
712
- {
713
- "epoch": 1.995810773426076,
714
- "grad_norm": 0.21081362664699554,
715
- "learning_rate": 1.2547919219064935e-07,
716
- "loss": 0.3495,
717
- "step": 50500
718
- }
719
- ],
720
- "logging_steps": 500,
721
- "max_steps": 50606,
722
- "num_input_tokens_seen": 0,
723
- "num_train_epochs": 2,
724
- "save_steps": 500,
725
- "stateful_callbacks": {
726
- "TrainerControl": {
727
- "args": {
728
- "should_epoch_stop": false,
729
- "should_evaluate": false,
730
- "should_log": false,
731
- "should_save": true,
732
- "should_training_stop": true
733
- },
734
- "attributes": {}
735
- }
736
- },
737
- "total_flos": 1.0958557994287104e+17,
738
- "train_batch_size": 32,
739
- "trial_name": null,
740
- "trial_params": null
741
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/{checkpoint-50000 → checkpoint-61500}/config.json RENAMED
File without changes
checkpoints/{checkpoint-50000 → checkpoint-61500}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-50000 → checkpoint-61500}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e382d9710542b95305776750804e034c780abde907f1317589c50130eecef7e
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08bb2f5549ca7a4686fa163f40a3a840a02556dd674b562f51a7d4b3bbb25446
3
  size 242041896
checkpoints/{checkpoint-50500 → checkpoint-61500}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da7544c70bb05dda6d3e421d4d01f80ccbfc8d0263e0633d9f6459ad741b8d9b
3
  size 484163514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3c0b1cfae7064c3e583c81fc7b89d7d3c6a5688579e6e54be3d400f2a760681
3
  size 484163514
checkpoints/{checkpoint-50606 → checkpoint-61500}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0db62a295b881cbdac891ce60eb24e88bbd9d8e5506d709022b20a86fb1c0684
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451ec28fae2c51302e3d439040b8e5e6eb5b4c0d1c58af17230cc30adbfb2190
3
  size 14244
checkpoints/{checkpoint-50606 → checkpoint-61500}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:986c45e57db5c2e3196ab438a5d7b1c65b4d0fc415f78119e7394ea22edcec1a
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9ec04d900cba2a2e93335548c511413a07c2ec81cdb8a6699d3923fe215e69
3
  size 988
checkpoints/{checkpoint-50000 → checkpoint-61500}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39f4cec1053f49ae11f7ce465776e88cf7c41cc5c33de785dcded6ad1f106627
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006aa6abdcde538a5090b2a8f8114d416f5dd26532791f36d1980c632d584d30
3
  size 1064
checkpoints/{checkpoint-50000 → checkpoint-61500}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-50000 → checkpoint-61500}/spiece.model RENAMED
File without changes
checkpoints/{checkpoint-50000 → checkpoint-61500}/tokenizer.json RENAMED
File without changes
checkpoints/{checkpoint-50000 → checkpoint-61500}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-61500/trainer_state.json ADDED
@@ -0,0 +1,895 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9766021726553964,
6
+ "eval_steps": 500,
7
+ "global_step": 61500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016069936363052,
14
+ "grad_norm": 0.3969729542732239,
15
+ "learning_rate": 4.960146557819631e-05,
16
+ "loss": 2.05,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.032139872726104,
21
+ "grad_norm": 0.3822907507419586,
22
+ "learning_rate": 4.919971716912001e-05,
23
+ "loss": 1.1207,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.04820980908915601,
28
+ "grad_norm": 0.36019280552864075,
29
+ "learning_rate": 4.879796876004371e-05,
30
+ "loss": 0.9225,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.064279745452208,
35
+ "grad_norm": 0.30364033579826355,
36
+ "learning_rate": 4.8396220350967415e-05,
37
+ "loss": 0.8244,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.08034968181526002,
42
+ "grad_norm": 0.45634394884109497,
43
+ "learning_rate": 4.799447194189111e-05,
44
+ "loss": 0.7506,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.09641961817831202,
49
+ "grad_norm": 0.3562425374984741,
50
+ "learning_rate": 4.759272353281481e-05,
51
+ "loss": 0.7012,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.11248955454136401,
56
+ "grad_norm": 0.33726808428764343,
57
+ "learning_rate": 4.719097512373851e-05,
58
+ "loss": 0.6706,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.128559490904416,
63
+ "grad_norm": 0.30098849534988403,
64
+ "learning_rate": 4.678922671466221e-05,
65
+ "loss": 0.6308,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.14462942726746802,
70
+ "grad_norm": 0.29443585872650146,
71
+ "learning_rate": 4.6387478305585915e-05,
72
+ "loss": 0.6141,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.16069936363052004,
77
+ "grad_norm": 0.25647810101509094,
78
+ "learning_rate": 4.598572989650961e-05,
79
+ "loss": 0.5866,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.17676929999357202,
84
+ "grad_norm": 0.2516370415687561,
85
+ "learning_rate": 4.558398148743331e-05,
86
+ "loss": 0.5665,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.19283923635662403,
91
+ "grad_norm": 0.3337278366088867,
92
+ "learning_rate": 4.518223307835701e-05,
93
+ "loss": 0.5427,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.20890917271967602,
98
+ "grad_norm": 0.2592964470386505,
99
+ "learning_rate": 4.478048466928072e-05,
100
+ "loss": 0.5323,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.22497910908272803,
105
+ "grad_norm": 0.28550606966018677,
106
+ "learning_rate": 4.437873626020441e-05,
107
+ "loss": 0.5187,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.24104904544578004,
112
+ "grad_norm": 0.26474013924598694,
113
+ "learning_rate": 4.397698785112811e-05,
114
+ "loss": 0.5058,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.257118981808832,
119
+ "grad_norm": 0.3018198013305664,
120
+ "learning_rate": 4.3575239442051814e-05,
121
+ "loss": 0.5013,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.27318891817188407,
126
+ "grad_norm": 0.2628585994243622,
127
+ "learning_rate": 4.317349103297551e-05,
128
+ "loss": 0.4883,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.28925885453493605,
133
+ "grad_norm": 0.30172979831695557,
134
+ "learning_rate": 4.277174262389921e-05,
135
+ "loss": 0.4795,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.30532879089798803,
140
+ "grad_norm": 0.25293004512786865,
141
+ "learning_rate": 4.236999421482291e-05,
142
+ "loss": 0.4682,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.3213987272610401,
147
+ "grad_norm": 0.2726214528083801,
148
+ "learning_rate": 4.196824580574661e-05,
149
+ "loss": 0.4641,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.33746866362409206,
154
+ "grad_norm": 0.2570224106311798,
155
+ "learning_rate": 4.1566497396670314e-05,
156
+ "loss": 0.4556,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.35353859998714404,
161
+ "grad_norm": 0.26380738615989685,
162
+ "learning_rate": 4.1164748987594006e-05,
163
+ "loss": 0.449,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.369608536350196,
168
+ "grad_norm": 0.2555176913738251,
169
+ "learning_rate": 4.076300057851771e-05,
170
+ "loss": 0.4412,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.38567847271324807,
175
+ "grad_norm": 0.2122594565153122,
176
+ "learning_rate": 4.036125216944141e-05,
177
+ "loss": 0.4365,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.40174840907630005,
182
+ "grad_norm": 0.2333071529865265,
183
+ "learning_rate": 3.9959503760365116e-05,
184
+ "loss": 0.433,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.41781834543935203,
189
+ "grad_norm": 0.24873752892017365,
190
+ "learning_rate": 3.955775535128881e-05,
191
+ "loss": 0.4283,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.4338882818024041,
196
+ "grad_norm": 0.32416871190071106,
197
+ "learning_rate": 3.915600694221251e-05,
198
+ "loss": 0.4218,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.44995821816545606,
203
+ "grad_norm": 0.23515433073043823,
204
+ "learning_rate": 3.875425853313621e-05,
205
+ "loss": 0.4139,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.46602815452850804,
210
+ "grad_norm": 0.22002151608467102,
211
+ "learning_rate": 3.8353313620878064e-05,
212
+ "loss": 0.417,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 0.4820980908915601,
217
+ "grad_norm": 0.251897931098938,
218
+ "learning_rate": 3.795156521180176e-05,
219
+ "loss": 0.4106,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 0.49816802725461207,
224
+ "grad_norm": 0.26212435960769653,
225
+ "learning_rate": 3.754981680272546e-05,
226
+ "loss": 0.4037,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 0.514237963617664,
231
+ "grad_norm": 0.2718159258365631,
232
+ "learning_rate": 3.714887189046731e-05,
233
+ "loss": 0.402,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 0.530307899980716,
238
+ "grad_norm": 0.23812739551067352,
239
+ "learning_rate": 3.674712348139102e-05,
240
+ "loss": 0.3953,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 0.5463778363437681,
245
+ "grad_norm": 0.21076083183288574,
246
+ "learning_rate": 3.634537507231471e-05,
247
+ "loss": 0.3938,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 0.5624477727068201,
252
+ "grad_norm": 0.25489869713783264,
253
+ "learning_rate": 3.5943626663238416e-05,
254
+ "loss": 0.3921,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 0.5785177090698721,
259
+ "grad_norm": 0.24057357013225555,
260
+ "learning_rate": 3.5541878254162115e-05,
261
+ "loss": 0.3867,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 0.5945876454329241,
266
+ "grad_norm": 0.24298915266990662,
267
+ "learning_rate": 3.514012984508582e-05,
268
+ "loss": 0.3868,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 0.6106575817959761,
273
+ "grad_norm": 0.2183919996023178,
274
+ "learning_rate": 3.473838143600951e-05,
275
+ "loss": 0.3803,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 0.626727518159028,
280
+ "grad_norm": 0.2278251349925995,
281
+ "learning_rate": 3.433663302693321e-05,
282
+ "loss": 0.3775,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 0.6427974545220801,
287
+ "grad_norm": 0.240201935172081,
288
+ "learning_rate": 3.393568811467507e-05,
289
+ "loss": 0.3751,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 0.6588673908851321,
294
+ "grad_norm": 0.21118561923503876,
295
+ "learning_rate": 3.353393970559877e-05,
296
+ "loss": 0.3742,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 0.6749373272481841,
301
+ "grad_norm": 0.22640825808048248,
302
+ "learning_rate": 3.313219129652247e-05,
303
+ "loss": 0.3729,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 0.6910072636112361,
308
+ "grad_norm": 0.23105542361736298,
309
+ "learning_rate": 3.2730442887446166e-05,
310
+ "loss": 0.3687,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 0.7070771999742881,
315
+ "grad_norm": 0.24791008234024048,
316
+ "learning_rate": 3.2329497975188024e-05,
317
+ "loss": 0.3658,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 0.7231471363373401,
322
+ "grad_norm": 0.2497881054878235,
323
+ "learning_rate": 3.1928553062929875e-05,
324
+ "loss": 0.3646,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.739217072700392,
329
+ "grad_norm": 0.2395261973142624,
330
+ "learning_rate": 3.152680465385357e-05,
331
+ "loss": 0.3655,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.7552870090634441,
336
+ "grad_norm": 0.21194589138031006,
337
+ "learning_rate": 3.112505624477727e-05,
338
+ "loss": 0.3646,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.7713569454264961,
343
+ "grad_norm": 0.21682508289813995,
344
+ "learning_rate": 3.072330783570097e-05,
345
+ "loss": 0.3629,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.7874268817895481,
350
+ "grad_norm": 0.23710566759109497,
351
+ "learning_rate": 3.0321559426624674e-05,
352
+ "loss": 0.3583,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.8034968181526001,
357
+ "grad_norm": 0.23857219517230988,
358
+ "learning_rate": 2.9919811017548372e-05,
359
+ "loss": 0.3561,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.8195667545156521,
364
+ "grad_norm": 0.241951584815979,
365
+ "learning_rate": 2.9518062608472075e-05,
366
+ "loss": 0.3537,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.8356366908787041,
371
+ "grad_norm": 0.275765061378479,
372
+ "learning_rate": 2.9116314199395773e-05,
373
+ "loss": 0.3493,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.8517066272417562,
378
+ "grad_norm": 0.24757184088230133,
379
+ "learning_rate": 2.871536928713762e-05,
380
+ "loss": 0.3486,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.8677765636048081,
385
+ "grad_norm": 0.21833688020706177,
386
+ "learning_rate": 2.8313620878061327e-05,
387
+ "loss": 0.3461,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 0.8838464999678601,
392
+ "grad_norm": 0.21623168885707855,
393
+ "learning_rate": 2.7911872468985022e-05,
394
+ "loss": 0.3468,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 0.8999164363309121,
399
+ "grad_norm": 0.20861521363258362,
400
+ "learning_rate": 2.7510124059908728e-05,
401
+ "loss": 0.3481,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 0.9159863726939641,
406
+ "grad_norm": 0.20291315019130707,
407
+ "learning_rate": 2.7108375650832423e-05,
408
+ "loss": 0.3474,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 0.9320563090570161,
413
+ "grad_norm": 0.2101660966873169,
414
+ "learning_rate": 2.6707430738574275e-05,
415
+ "loss": 0.3412,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 0.9481262454200682,
420
+ "grad_norm": 0.23224739730358124,
421
+ "learning_rate": 2.6305682329497977e-05,
422
+ "loss": 0.3422,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 0.9641961817831202,
427
+ "grad_norm": 0.22987599670886993,
428
+ "learning_rate": 2.5903933920421676e-05,
429
+ "loss": 0.3407,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 0.9802661181461721,
434
+ "grad_norm": 0.22307533025741577,
435
+ "learning_rate": 2.5502185511345378e-05,
436
+ "loss": 0.3365,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 0.9963360545092241,
441
+ "grad_norm": 0.20577801764011383,
442
+ "learning_rate": 2.510124059908723e-05,
443
+ "loss": 0.3409,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.0124059908722762,
448
+ "grad_norm": 0.23968417942523956,
449
+ "learning_rate": 2.4699492190010928e-05,
450
+ "loss": 0.339,
451
+ "step": 31500
452
+ },
453
+ {
454
+ "epoch": 1.028475927235328,
455
+ "grad_norm": 0.2166174054145813,
456
+ "learning_rate": 2.429774378093463e-05,
457
+ "loss": 0.3317,
458
+ "step": 32000
459
+ },
460
+ {
461
+ "epoch": 1.0445458635983802,
462
+ "grad_norm": 0.22259151935577393,
463
+ "learning_rate": 2.389599537185833e-05,
464
+ "loss": 0.3404,
465
+ "step": 32500
466
+ },
467
+ {
468
+ "epoch": 1.060615799961432,
469
+ "grad_norm": 0.2585219442844391,
470
+ "learning_rate": 2.3495050459600184e-05,
471
+ "loss": 0.3322,
472
+ "step": 33000
473
+ },
474
+ {
475
+ "epoch": 1.0766857363244842,
476
+ "grad_norm": 0.23949937522411346,
477
+ "learning_rate": 2.3093302050523882e-05,
478
+ "loss": 0.3332,
479
+ "step": 33500
480
+ },
481
+ {
482
+ "epoch": 1.0927556726875363,
483
+ "grad_norm": 0.2360944151878357,
484
+ "learning_rate": 2.269155364144758e-05,
485
+ "loss": 0.3374,
486
+ "step": 34000
487
+ },
488
+ {
489
+ "epoch": 1.1088256090505881,
490
+ "grad_norm": 0.23383018374443054,
491
+ "learning_rate": 2.228980523237128e-05,
492
+ "loss": 0.3287,
493
+ "step": 34500
494
+ },
495
+ {
496
+ "epoch": 1.1248955454136402,
497
+ "grad_norm": 0.25602060556411743,
498
+ "learning_rate": 2.1888860320113135e-05,
499
+ "loss": 0.3262,
500
+ "step": 35000
501
+ },
502
+ {
503
+ "epoch": 1.140965481776692,
504
+ "grad_norm": 0.2233658730983734,
505
+ "learning_rate": 2.1487111911036833e-05,
506
+ "loss": 0.3294,
507
+ "step": 35500
508
+ },
509
+ {
510
+ "epoch": 1.1570354181397442,
511
+ "grad_norm": 0.23545712232589722,
512
+ "learning_rate": 2.1085363501960532e-05,
513
+ "loss": 0.3263,
514
+ "step": 36000
515
+ },
516
+ {
517
+ "epoch": 1.173105354502796,
518
+ "grad_norm": 0.22479598224163055,
519
+ "learning_rate": 2.0683615092884234e-05,
520
+ "loss": 0.328,
521
+ "step": 36500
522
+ },
523
+ {
524
+ "epoch": 1.1891752908658482,
525
+ "grad_norm": 0.22207121551036835,
526
+ "learning_rate": 2.0282670180626086e-05,
527
+ "loss": 0.3275,
528
+ "step": 37000
529
+ },
530
+ {
531
+ "epoch": 1.2052452272289003,
532
+ "grad_norm": 0.23822110891342163,
533
+ "learning_rate": 1.9880921771549785e-05,
534
+ "loss": 0.3273,
535
+ "step": 37500
536
+ },
537
+ {
538
+ "epoch": 1.2213151635919521,
539
+ "grad_norm": 0.23664866387844086,
540
+ "learning_rate": 1.9479173362473487e-05,
541
+ "loss": 0.318,
542
+ "step": 38000
543
+ },
544
+ {
545
+ "epoch": 1.2373850999550042,
546
+ "grad_norm": 0.18543508648872375,
547
+ "learning_rate": 1.9077424953397185e-05,
548
+ "loss": 0.3235,
549
+ "step": 38500
550
+ },
551
+ {
552
+ "epoch": 1.253455036318056,
553
+ "grad_norm": 0.23305822908878326,
554
+ "learning_rate": 1.8676480041139037e-05,
555
+ "loss": 0.3243,
556
+ "step": 39000
557
+ },
558
+ {
559
+ "epoch": 1.2695249726811082,
560
+ "grad_norm": 0.21699073910713196,
561
+ "learning_rate": 1.827473163206274e-05,
562
+ "loss": 0.3222,
563
+ "step": 39500
564
+ },
565
+ {
566
+ "epoch": 1.28559490904416,
567
+ "grad_norm": 0.2757895588874817,
568
+ "learning_rate": 1.7872983222986438e-05,
569
+ "loss": 0.3248,
570
+ "step": 40000
571
+ },
572
+ {
573
+ "epoch": 1.3016648454072122,
574
+ "grad_norm": 0.19769324362277985,
575
+ "learning_rate": 1.7471234813910137e-05,
576
+ "loss": 0.3179,
577
+ "step": 40500
578
+ },
579
+ {
580
+ "epoch": 1.3177347817702643,
581
+ "grad_norm": 0.18964402377605438,
582
+ "learning_rate": 1.707028990165199e-05,
583
+ "loss": 0.3178,
584
+ "step": 41000
585
+ },
586
+ {
587
+ "epoch": 1.3338047181333161,
588
+ "grad_norm": 0.2584107220172882,
589
+ "learning_rate": 1.666854149257569e-05,
590
+ "loss": 0.318,
591
+ "step": 41500
592
+ },
593
+ {
594
+ "epoch": 1.3498746544963682,
595
+ "grad_norm": 0.25919750332832336,
596
+ "learning_rate": 1.626759658031754e-05,
597
+ "loss": 0.3205,
598
+ "step": 42000
599
+ },
600
+ {
601
+ "epoch": 1.3659445908594203,
602
+ "grad_norm": 0.24371759593486786,
603
+ "learning_rate": 1.5865848171241244e-05,
604
+ "loss": 0.3186,
605
+ "step": 42500
606
+ },
607
+ {
608
+ "epoch": 1.3820145272224722,
609
+ "grad_norm": 0.24457883834838867,
610
+ "learning_rate": 1.5464099762164942e-05,
611
+ "loss": 0.3162,
612
+ "step": 43000
613
+ },
614
+ {
615
+ "epoch": 1.398084463585524,
616
+ "grad_norm": 0.1918337345123291,
617
+ "learning_rate": 1.5062351353088641e-05,
618
+ "loss": 0.3169,
619
+ "step": 43500
620
+ },
621
+ {
622
+ "epoch": 1.4141543999485762,
623
+ "grad_norm": 0.2350657880306244,
624
+ "learning_rate": 1.4660602944012342e-05,
625
+ "loss": 0.3171,
626
+ "step": 44000
627
+ },
628
+ {
629
+ "epoch": 1.4302243363116283,
630
+ "grad_norm": 0.2481279820203781,
631
+ "learning_rate": 1.4258854534936042e-05,
632
+ "loss": 0.3179,
633
+ "step": 44500
634
+ },
635
+ {
636
+ "epoch": 1.4462942726746801,
637
+ "grad_norm": 0.21132701635360718,
638
+ "learning_rate": 1.3857106125859743e-05,
639
+ "loss": 0.3125,
640
+ "step": 45000
641
+ },
642
+ {
643
+ "epoch": 1.4623642090377322,
644
+ "grad_norm": 0.20240716636180878,
645
+ "learning_rate": 1.3455357716783443e-05,
646
+ "loss": 0.3172,
647
+ "step": 45500
648
+ },
649
+ {
650
+ "epoch": 1.4784341454007843,
651
+ "grad_norm": 0.2224823385477066,
652
+ "learning_rate": 1.3054412804525296e-05,
653
+ "loss": 0.3151,
654
+ "step": 46000
655
+ },
656
+ {
657
+ "epoch": 1.4945040817638362,
658
+ "grad_norm": 0.19261781871318817,
659
+ "learning_rate": 1.2652664395448997e-05,
660
+ "loss": 0.312,
661
+ "step": 46500
662
+ },
663
+ {
664
+ "epoch": 1.510574018126888,
665
+ "grad_norm": 0.16068917512893677,
666
+ "learning_rate": 1.2250915986372695e-05,
667
+ "loss": 0.3145,
668
+ "step": 47000
669
+ },
670
+ {
671
+ "epoch": 1.5266439544899402,
672
+ "grad_norm": 0.18192972242832184,
673
+ "learning_rate": 1.1849167577296394e-05,
674
+ "loss": 0.3134,
675
+ "step": 47500
676
+ },
677
+ {
678
+ "epoch": 1.5427138908529923,
679
+ "grad_norm": 0.19884943962097168,
680
+ "learning_rate": 1.1448222665038247e-05,
681
+ "loss": 0.3119,
682
+ "step": 48000
683
+ },
684
+ {
685
+ "epoch": 1.5587838272160441,
686
+ "grad_norm": 0.1883106529712677,
687
+ "learning_rate": 1.1046474255961948e-05,
688
+ "loss": 0.316,
689
+ "step": 48500
690
+ },
691
+ {
692
+ "epoch": 1.5748537635790962,
693
+ "grad_norm": 0.19331087172031403,
694
+ "learning_rate": 1.0644725846885646e-05,
695
+ "loss": 0.3135,
696
+ "step": 49000
697
+ },
698
+ {
699
+ "epoch": 1.5909236999421483,
700
+ "grad_norm": 0.20041531324386597,
701
+ "learning_rate": 1.0242977437809347e-05,
702
+ "loss": 0.3112,
703
+ "step": 49500
704
+ },
705
+ {
706
+ "epoch": 1.6069936363052002,
707
+ "grad_norm": 0.18530187010765076,
708
+ "learning_rate": 9.8420325255512e-06,
709
+ "loss": 0.3122,
710
+ "step": 50000
711
+ },
712
+ {
713
+ "epoch": 1.623063572668252,
714
+ "grad_norm": 0.22725620865821838,
715
+ "learning_rate": 9.4402841164749e-06,
716
+ "loss": 0.3122,
717
+ "step": 50500
718
+ },
719
+ {
720
+ "epoch": 1.6391335090313044,
721
+ "grad_norm": 0.23093479871749878,
722
+ "learning_rate": 9.0385357073986e-06,
723
+ "loss": 0.3149,
724
+ "step": 51000
725
+ },
726
+ {
727
+ "epoch": 1.6552034453943563,
728
+ "grad_norm": 0.19580845534801483,
729
+ "learning_rate": 8.6367872983223e-06,
730
+ "loss": 0.3121,
731
+ "step": 51500
732
+ },
733
+ {
734
+ "epoch": 1.6712733817574081,
735
+ "grad_norm": 0.1742846667766571,
736
+ "learning_rate": 8.235842386064153e-06,
737
+ "loss": 0.3094,
738
+ "step": 52000
739
+ },
740
+ {
741
+ "epoch": 1.6873433181204602,
742
+ "grad_norm": 0.18685191869735718,
743
+ "learning_rate": 7.834093976987852e-06,
744
+ "loss": 0.309,
745
+ "step": 52500
746
+ },
747
+ {
748
+ "epoch": 1.7034132544835123,
749
+ "grad_norm": 0.21959276497364044,
750
+ "learning_rate": 7.432345567911551e-06,
751
+ "loss": 0.3118,
752
+ "step": 53000
753
+ },
754
+ {
755
+ "epoch": 1.7194831908465642,
756
+ "grad_norm": 0.1935770958662033,
757
+ "learning_rate": 7.030597158835252e-06,
758
+ "loss": 0.3106,
759
+ "step": 53500
760
+ },
761
+ {
762
+ "epoch": 1.7355531272096163,
763
+ "grad_norm": 0.19977129995822906,
764
+ "learning_rate": 6.629652246577103e-06,
765
+ "loss": 0.3101,
766
+ "step": 54000
767
+ },
768
+ {
769
+ "epoch": 1.7516230635726684,
770
+ "grad_norm": 0.2006288766860962,
771
+ "learning_rate": 6.2279038375008035e-06,
772
+ "loss": 0.3099,
773
+ "step": 54500
774
+ },
775
+ {
776
+ "epoch": 1.7676929999357203,
777
+ "grad_norm": 0.19280743598937988,
778
+ "learning_rate": 5.826155428424504e-06,
779
+ "loss": 0.308,
780
+ "step": 55000
781
+ },
782
+ {
783
+ "epoch": 1.7837629362987721,
784
+ "grad_norm": 0.22095157206058502,
785
+ "learning_rate": 5.424407019348204e-06,
786
+ "loss": 0.3069,
787
+ "step": 55500
788
+ },
789
+ {
790
+ "epoch": 1.7998328726618242,
791
+ "grad_norm": 0.2091740071773529,
792
+ "learning_rate": 5.022658610271903e-06,
793
+ "loss": 0.3062,
794
+ "step": 56000
795
+ },
796
+ {
797
+ "epoch": 1.8159028090248763,
798
+ "grad_norm": 0.24772244691848755,
799
+ "learning_rate": 4.620910201195604e-06,
800
+ "loss": 0.3093,
801
+ "step": 56500
802
+ },
803
+ {
804
+ "epoch": 1.8319727453879282,
805
+ "grad_norm": 0.1973961740732193,
806
+ "learning_rate": 4.219161792119303e-06,
807
+ "loss": 0.309,
808
+ "step": 57000
809
+ },
810
+ {
811
+ "epoch": 1.8480426817509803,
812
+ "grad_norm": 0.22767914831638336,
813
+ "learning_rate": 3.817413383043003e-06,
814
+ "loss": 0.3109,
815
+ "step": 57500
816
+ },
817
+ {
818
+ "epoch": 1.8641126181140324,
819
+ "grad_norm": 0.21461111307144165,
820
+ "learning_rate": 3.416468470784856e-06,
821
+ "loss": 0.3075,
822
+ "step": 58000
823
+ },
824
+ {
825
+ "epoch": 1.8801825544770843,
826
+ "grad_norm": 0.24607454240322113,
827
+ "learning_rate": 3.0147200617085557e-06,
828
+ "loss": 0.3058,
829
+ "step": 58500
830
+ },
831
+ {
832
+ "epoch": 1.8962524908401361,
833
+ "grad_norm": 0.19667118787765503,
834
+ "learning_rate": 2.6129716526322558e-06,
835
+ "loss": 0.3072,
836
+ "step": 59000
837
+ },
838
+ {
839
+ "epoch": 1.9123224272031882,
840
+ "grad_norm": 0.22604137659072876,
841
+ "learning_rate": 2.211223243555956e-06,
842
+ "loss": 0.3064,
843
+ "step": 59500
844
+ },
845
+ {
846
+ "epoch": 1.9283923635662403,
847
+ "grad_norm": 0.1879967898130417,
848
+ "learning_rate": 1.8102783312978082e-06,
849
+ "loss": 0.3063,
850
+ "step": 60000
851
+ },
852
+ {
853
+ "epoch": 1.9444622999292922,
854
+ "grad_norm": 0.21271295845508575,
855
+ "learning_rate": 1.408529922221508e-06,
856
+ "loss": 0.3076,
857
+ "step": 60500
858
+ },
859
+ {
860
+ "epoch": 1.9605322362923443,
861
+ "grad_norm": 0.16714586317539215,
862
+ "learning_rate": 1.006781513145208e-06,
863
+ "loss": 0.3092,
864
+ "step": 61000
865
+ },
866
+ {
867
+ "epoch": 1.9766021726553964,
868
+ "grad_norm": 0.20666128396987915,
869
+ "learning_rate": 6.050331040689079e-07,
870
+ "loss": 0.3076,
871
+ "step": 61500
872
+ }
873
+ ],
874
+ "logging_steps": 500,
875
+ "max_steps": 62228,
876
+ "num_input_tokens_seen": 0,
877
+ "num_train_epochs": 2,
878
+ "save_steps": 500,
879
+ "stateful_callbacks": {
880
+ "TrainerControl": {
881
+ "args": {
882
+ "should_epoch_stop": false,
883
+ "should_evaluate": false,
884
+ "should_log": false,
885
+ "should_save": true,
886
+ "should_training_stop": false
887
+ },
888
+ "attributes": {}
889
+ }
890
+ },
891
+ "total_flos": 1.3317619730664653e+17,
892
+ "train_batch_size": 32,
893
+ "trial_name": null,
894
+ "trial_params": null
895
+ }
checkpoints/{checkpoint-50000 → checkpoint-61500}/training_args.bin RENAMED
File without changes
checkpoints/{checkpoint-50500 → checkpoint-62000}/config.json RENAMED
File without changes
checkpoints/{checkpoint-50500 → checkpoint-62000}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-50500 → checkpoint-62000}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:539eecb51da20c6c594586682f50a31d97e95255185c19f683edecc847571985
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21d2693f103a4b1c9756dd65ffa057b16bedd52438d1857e7abe6cc6bbbdc118
3
  size 242041896
checkpoints/{checkpoint-50000 → checkpoint-62000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a43b301226252047fe58f3dadcccc852d7a46f76eec9bae148f1aff077c42eeb
3
  size 484163514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d67c69f9573de5a8dadf16db4aeb6d985098267281696ebb224c45bc379b1125
3
  size 484163514
checkpoints/{checkpoint-50500 → checkpoint-62000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc37bb34d81b591803fb94a0dd3c5ba8a76fe1e8c1af6c781300f1adb8675452
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b09152c000e511e6ddafcdda96b8df75e3964a1a6d6a59502f1c6b09d5600b
3
  size 14244
checkpoints/{checkpoint-50500 → checkpoint-62000}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ff50001dca923c16ec2b0024a83128c7c05b8afd5850a1ef829f6b51043372b
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654ff04f86d41e38c4c564123d36adbd0e83e01bd73996a4587ead262cae63cb
3
  size 988
checkpoints/{checkpoint-50606 → checkpoint-62000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7eca8eb5f88ac27cb1600eeb99f4f0af9058c5a926e84e618a726dd2f0d5a8b3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7785e8be4cba2ff2308ebbf74c247c1df9ee9867422dbbc081226579438090ce
3
  size 1064
checkpoints/{checkpoint-50500 → checkpoint-62000}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-50500 → checkpoint-62000}/spiece.model RENAMED
File without changes
checkpoints/{checkpoint-50500 → checkpoint-62000}/tokenizer.json RENAMED
File without changes
checkpoints/{checkpoint-50500 → checkpoint-62000}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-62000/trainer_state.json ADDED
@@ -0,0 +1,902 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9926721090184483,
6
+ "eval_steps": 500,
7
+ "global_step": 62000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016069936363052,
14
+ "grad_norm": 0.3969729542732239,
15
+ "learning_rate": 4.960146557819631e-05,
16
+ "loss": 2.05,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.032139872726104,
21
+ "grad_norm": 0.3822907507419586,
22
+ "learning_rate": 4.919971716912001e-05,
23
+ "loss": 1.1207,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.04820980908915601,
28
+ "grad_norm": 0.36019280552864075,
29
+ "learning_rate": 4.879796876004371e-05,
30
+ "loss": 0.9225,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.064279745452208,
35
+ "grad_norm": 0.30364033579826355,
36
+ "learning_rate": 4.8396220350967415e-05,
37
+ "loss": 0.8244,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.08034968181526002,
42
+ "grad_norm": 0.45634394884109497,
43
+ "learning_rate": 4.799447194189111e-05,
44
+ "loss": 0.7506,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.09641961817831202,
49
+ "grad_norm": 0.3562425374984741,
50
+ "learning_rate": 4.759272353281481e-05,
51
+ "loss": 0.7012,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.11248955454136401,
56
+ "grad_norm": 0.33726808428764343,
57
+ "learning_rate": 4.719097512373851e-05,
58
+ "loss": 0.6706,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.128559490904416,
63
+ "grad_norm": 0.30098849534988403,
64
+ "learning_rate": 4.678922671466221e-05,
65
+ "loss": 0.6308,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.14462942726746802,
70
+ "grad_norm": 0.29443585872650146,
71
+ "learning_rate": 4.6387478305585915e-05,
72
+ "loss": 0.6141,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.16069936363052004,
77
+ "grad_norm": 0.25647810101509094,
78
+ "learning_rate": 4.598572989650961e-05,
79
+ "loss": 0.5866,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.17676929999357202,
84
+ "grad_norm": 0.2516370415687561,
85
+ "learning_rate": 4.558398148743331e-05,
86
+ "loss": 0.5665,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.19283923635662403,
91
+ "grad_norm": 0.3337278366088867,
92
+ "learning_rate": 4.518223307835701e-05,
93
+ "loss": 0.5427,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.20890917271967602,
98
+ "grad_norm": 0.2592964470386505,
99
+ "learning_rate": 4.478048466928072e-05,
100
+ "loss": 0.5323,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.22497910908272803,
105
+ "grad_norm": 0.28550606966018677,
106
+ "learning_rate": 4.437873626020441e-05,
107
+ "loss": 0.5187,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.24104904544578004,
112
+ "grad_norm": 0.26474013924598694,
113
+ "learning_rate": 4.397698785112811e-05,
114
+ "loss": 0.5058,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.257118981808832,
119
+ "grad_norm": 0.3018198013305664,
120
+ "learning_rate": 4.3575239442051814e-05,
121
+ "loss": 0.5013,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.27318891817188407,
126
+ "grad_norm": 0.2628585994243622,
127
+ "learning_rate": 4.317349103297551e-05,
128
+ "loss": 0.4883,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.28925885453493605,
133
+ "grad_norm": 0.30172979831695557,
134
+ "learning_rate": 4.277174262389921e-05,
135
+ "loss": 0.4795,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.30532879089798803,
140
+ "grad_norm": 0.25293004512786865,
141
+ "learning_rate": 4.236999421482291e-05,
142
+ "loss": 0.4682,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.3213987272610401,
147
+ "grad_norm": 0.2726214528083801,
148
+ "learning_rate": 4.196824580574661e-05,
149
+ "loss": 0.4641,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.33746866362409206,
154
+ "grad_norm": 0.2570224106311798,
155
+ "learning_rate": 4.1566497396670314e-05,
156
+ "loss": 0.4556,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.35353859998714404,
161
+ "grad_norm": 0.26380738615989685,
162
+ "learning_rate": 4.1164748987594006e-05,
163
+ "loss": 0.449,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.369608536350196,
168
+ "grad_norm": 0.2555176913738251,
169
+ "learning_rate": 4.076300057851771e-05,
170
+ "loss": 0.4412,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.38567847271324807,
175
+ "grad_norm": 0.2122594565153122,
176
+ "learning_rate": 4.036125216944141e-05,
177
+ "loss": 0.4365,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.40174840907630005,
182
+ "grad_norm": 0.2333071529865265,
183
+ "learning_rate": 3.9959503760365116e-05,
184
+ "loss": 0.433,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.41781834543935203,
189
+ "grad_norm": 0.24873752892017365,
190
+ "learning_rate": 3.955775535128881e-05,
191
+ "loss": 0.4283,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.4338882818024041,
196
+ "grad_norm": 0.32416871190071106,
197
+ "learning_rate": 3.915600694221251e-05,
198
+ "loss": 0.4218,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.44995821816545606,
203
+ "grad_norm": 0.23515433073043823,
204
+ "learning_rate": 3.875425853313621e-05,
205
+ "loss": 0.4139,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.46602815452850804,
210
+ "grad_norm": 0.22002151608467102,
211
+ "learning_rate": 3.8353313620878064e-05,
212
+ "loss": 0.417,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 0.4820980908915601,
217
+ "grad_norm": 0.251897931098938,
218
+ "learning_rate": 3.795156521180176e-05,
219
+ "loss": 0.4106,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 0.49816802725461207,
224
+ "grad_norm": 0.26212435960769653,
225
+ "learning_rate": 3.754981680272546e-05,
226
+ "loss": 0.4037,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 0.514237963617664,
231
+ "grad_norm": 0.2718159258365631,
232
+ "learning_rate": 3.714887189046731e-05,
233
+ "loss": 0.402,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 0.530307899980716,
238
+ "grad_norm": 0.23812739551067352,
239
+ "learning_rate": 3.674712348139102e-05,
240
+ "loss": 0.3953,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 0.5463778363437681,
245
+ "grad_norm": 0.21076083183288574,
246
+ "learning_rate": 3.634537507231471e-05,
247
+ "loss": 0.3938,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 0.5624477727068201,
252
+ "grad_norm": 0.25489869713783264,
253
+ "learning_rate": 3.5943626663238416e-05,
254
+ "loss": 0.3921,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 0.5785177090698721,
259
+ "grad_norm": 0.24057357013225555,
260
+ "learning_rate": 3.5541878254162115e-05,
261
+ "loss": 0.3867,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 0.5945876454329241,
266
+ "grad_norm": 0.24298915266990662,
267
+ "learning_rate": 3.514012984508582e-05,
268
+ "loss": 0.3868,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 0.6106575817959761,
273
+ "grad_norm": 0.2183919996023178,
274
+ "learning_rate": 3.473838143600951e-05,
275
+ "loss": 0.3803,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 0.626727518159028,
280
+ "grad_norm": 0.2278251349925995,
281
+ "learning_rate": 3.433663302693321e-05,
282
+ "loss": 0.3775,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 0.6427974545220801,
287
+ "grad_norm": 0.240201935172081,
288
+ "learning_rate": 3.393568811467507e-05,
289
+ "loss": 0.3751,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 0.6588673908851321,
294
+ "grad_norm": 0.21118561923503876,
295
+ "learning_rate": 3.353393970559877e-05,
296
+ "loss": 0.3742,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 0.6749373272481841,
301
+ "grad_norm": 0.22640825808048248,
302
+ "learning_rate": 3.313219129652247e-05,
303
+ "loss": 0.3729,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 0.6910072636112361,
308
+ "grad_norm": 0.23105542361736298,
309
+ "learning_rate": 3.2730442887446166e-05,
310
+ "loss": 0.3687,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 0.7070771999742881,
315
+ "grad_norm": 0.24791008234024048,
316
+ "learning_rate": 3.2329497975188024e-05,
317
+ "loss": 0.3658,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 0.7231471363373401,
322
+ "grad_norm": 0.2497881054878235,
323
+ "learning_rate": 3.1928553062929875e-05,
324
+ "loss": 0.3646,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.739217072700392,
329
+ "grad_norm": 0.2395261973142624,
330
+ "learning_rate": 3.152680465385357e-05,
331
+ "loss": 0.3655,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.7552870090634441,
336
+ "grad_norm": 0.21194589138031006,
337
+ "learning_rate": 3.112505624477727e-05,
338
+ "loss": 0.3646,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.7713569454264961,
343
+ "grad_norm": 0.21682508289813995,
344
+ "learning_rate": 3.072330783570097e-05,
345
+ "loss": 0.3629,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.7874268817895481,
350
+ "grad_norm": 0.23710566759109497,
351
+ "learning_rate": 3.0321559426624674e-05,
352
+ "loss": 0.3583,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.8034968181526001,
357
+ "grad_norm": 0.23857219517230988,
358
+ "learning_rate": 2.9919811017548372e-05,
359
+ "loss": 0.3561,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.8195667545156521,
364
+ "grad_norm": 0.241951584815979,
365
+ "learning_rate": 2.9518062608472075e-05,
366
+ "loss": 0.3537,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.8356366908787041,
371
+ "grad_norm": 0.275765061378479,
372
+ "learning_rate": 2.9116314199395773e-05,
373
+ "loss": 0.3493,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.8517066272417562,
378
+ "grad_norm": 0.24757184088230133,
379
+ "learning_rate": 2.871536928713762e-05,
380
+ "loss": 0.3486,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.8677765636048081,
385
+ "grad_norm": 0.21833688020706177,
386
+ "learning_rate": 2.8313620878061327e-05,
387
+ "loss": 0.3461,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 0.8838464999678601,
392
+ "grad_norm": 0.21623168885707855,
393
+ "learning_rate": 2.7911872468985022e-05,
394
+ "loss": 0.3468,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 0.8999164363309121,
399
+ "grad_norm": 0.20861521363258362,
400
+ "learning_rate": 2.7510124059908728e-05,
401
+ "loss": 0.3481,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 0.9159863726939641,
406
+ "grad_norm": 0.20291315019130707,
407
+ "learning_rate": 2.7108375650832423e-05,
408
+ "loss": 0.3474,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 0.9320563090570161,
413
+ "grad_norm": 0.2101660966873169,
414
+ "learning_rate": 2.6707430738574275e-05,
415
+ "loss": 0.3412,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 0.9481262454200682,
420
+ "grad_norm": 0.23224739730358124,
421
+ "learning_rate": 2.6305682329497977e-05,
422
+ "loss": 0.3422,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 0.9641961817831202,
427
+ "grad_norm": 0.22987599670886993,
428
+ "learning_rate": 2.5903933920421676e-05,
429
+ "loss": 0.3407,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 0.9802661181461721,
434
+ "grad_norm": 0.22307533025741577,
435
+ "learning_rate": 2.5502185511345378e-05,
436
+ "loss": 0.3365,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 0.9963360545092241,
441
+ "grad_norm": 0.20577801764011383,
442
+ "learning_rate": 2.510124059908723e-05,
443
+ "loss": 0.3409,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.0124059908722762,
448
+ "grad_norm": 0.23968417942523956,
449
+ "learning_rate": 2.4699492190010928e-05,
450
+ "loss": 0.339,
451
+ "step": 31500
452
+ },
453
+ {
454
+ "epoch": 1.028475927235328,
455
+ "grad_norm": 0.2166174054145813,
456
+ "learning_rate": 2.429774378093463e-05,
457
+ "loss": 0.3317,
458
+ "step": 32000
459
+ },
460
+ {
461
+ "epoch": 1.0445458635983802,
462
+ "grad_norm": 0.22259151935577393,
463
+ "learning_rate": 2.389599537185833e-05,
464
+ "loss": 0.3404,
465
+ "step": 32500
466
+ },
467
+ {
468
+ "epoch": 1.060615799961432,
469
+ "grad_norm": 0.2585219442844391,
470
+ "learning_rate": 2.3495050459600184e-05,
471
+ "loss": 0.3322,
472
+ "step": 33000
473
+ },
474
+ {
475
+ "epoch": 1.0766857363244842,
476
+ "grad_norm": 0.23949937522411346,
477
+ "learning_rate": 2.3093302050523882e-05,
478
+ "loss": 0.3332,
479
+ "step": 33500
480
+ },
481
+ {
482
+ "epoch": 1.0927556726875363,
483
+ "grad_norm": 0.2360944151878357,
484
+ "learning_rate": 2.269155364144758e-05,
485
+ "loss": 0.3374,
486
+ "step": 34000
487
+ },
488
+ {
489
+ "epoch": 1.1088256090505881,
490
+ "grad_norm": 0.23383018374443054,
491
+ "learning_rate": 2.228980523237128e-05,
492
+ "loss": 0.3287,
493
+ "step": 34500
494
+ },
495
+ {
496
+ "epoch": 1.1248955454136402,
497
+ "grad_norm": 0.25602060556411743,
498
+ "learning_rate": 2.1888860320113135e-05,
499
+ "loss": 0.3262,
500
+ "step": 35000
501
+ },
502
+ {
503
+ "epoch": 1.140965481776692,
504
+ "grad_norm": 0.2233658730983734,
505
+ "learning_rate": 2.1487111911036833e-05,
506
+ "loss": 0.3294,
507
+ "step": 35500
508
+ },
509
+ {
510
+ "epoch": 1.1570354181397442,
511
+ "grad_norm": 0.23545712232589722,
512
+ "learning_rate": 2.1085363501960532e-05,
513
+ "loss": 0.3263,
514
+ "step": 36000
515
+ },
516
+ {
517
+ "epoch": 1.173105354502796,
518
+ "grad_norm": 0.22479598224163055,
519
+ "learning_rate": 2.0683615092884234e-05,
520
+ "loss": 0.328,
521
+ "step": 36500
522
+ },
523
+ {
524
+ "epoch": 1.1891752908658482,
525
+ "grad_norm": 0.22207121551036835,
526
+ "learning_rate": 2.0282670180626086e-05,
527
+ "loss": 0.3275,
528
+ "step": 37000
529
+ },
530
+ {
531
+ "epoch": 1.2052452272289003,
532
+ "grad_norm": 0.23822110891342163,
533
+ "learning_rate": 1.9880921771549785e-05,
534
+ "loss": 0.3273,
535
+ "step": 37500
536
+ },
537
+ {
538
+ "epoch": 1.2213151635919521,
539
+ "grad_norm": 0.23664866387844086,
540
+ "learning_rate": 1.9479173362473487e-05,
541
+ "loss": 0.318,
542
+ "step": 38000
543
+ },
544
+ {
545
+ "epoch": 1.2373850999550042,
546
+ "grad_norm": 0.18543508648872375,
547
+ "learning_rate": 1.9077424953397185e-05,
548
+ "loss": 0.3235,
549
+ "step": 38500
550
+ },
551
+ {
552
+ "epoch": 1.253455036318056,
553
+ "grad_norm": 0.23305822908878326,
554
+ "learning_rate": 1.8676480041139037e-05,
555
+ "loss": 0.3243,
556
+ "step": 39000
557
+ },
558
+ {
559
+ "epoch": 1.2695249726811082,
560
+ "grad_norm": 0.21699073910713196,
561
+ "learning_rate": 1.827473163206274e-05,
562
+ "loss": 0.3222,
563
+ "step": 39500
564
+ },
565
+ {
566
+ "epoch": 1.28559490904416,
567
+ "grad_norm": 0.2757895588874817,
568
+ "learning_rate": 1.7872983222986438e-05,
569
+ "loss": 0.3248,
570
+ "step": 40000
571
+ },
572
+ {
573
+ "epoch": 1.3016648454072122,
574
+ "grad_norm": 0.19769324362277985,
575
+ "learning_rate": 1.7471234813910137e-05,
576
+ "loss": 0.3179,
577
+ "step": 40500
578
+ },
579
+ {
580
+ "epoch": 1.3177347817702643,
581
+ "grad_norm": 0.18964402377605438,
582
+ "learning_rate": 1.707028990165199e-05,
583
+ "loss": 0.3178,
584
+ "step": 41000
585
+ },
586
+ {
587
+ "epoch": 1.3338047181333161,
588
+ "grad_norm": 0.2584107220172882,
589
+ "learning_rate": 1.666854149257569e-05,
590
+ "loss": 0.318,
591
+ "step": 41500
592
+ },
593
+ {
594
+ "epoch": 1.3498746544963682,
595
+ "grad_norm": 0.25919750332832336,
596
+ "learning_rate": 1.626759658031754e-05,
597
+ "loss": 0.3205,
598
+ "step": 42000
599
+ },
600
+ {
601
+ "epoch": 1.3659445908594203,
602
+ "grad_norm": 0.24371759593486786,
603
+ "learning_rate": 1.5865848171241244e-05,
604
+ "loss": 0.3186,
605
+ "step": 42500
606
+ },
607
+ {
608
+ "epoch": 1.3820145272224722,
609
+ "grad_norm": 0.24457883834838867,
610
+ "learning_rate": 1.5464099762164942e-05,
611
+ "loss": 0.3162,
612
+ "step": 43000
613
+ },
614
+ {
615
+ "epoch": 1.398084463585524,
616
+ "grad_norm": 0.1918337345123291,
617
+ "learning_rate": 1.5062351353088641e-05,
618
+ "loss": 0.3169,
619
+ "step": 43500
620
+ },
621
+ {
622
+ "epoch": 1.4141543999485762,
623
+ "grad_norm": 0.2350657880306244,
624
+ "learning_rate": 1.4660602944012342e-05,
625
+ "loss": 0.3171,
626
+ "step": 44000
627
+ },
628
+ {
629
+ "epoch": 1.4302243363116283,
630
+ "grad_norm": 0.2481279820203781,
631
+ "learning_rate": 1.4258854534936042e-05,
632
+ "loss": 0.3179,
633
+ "step": 44500
634
+ },
635
+ {
636
+ "epoch": 1.4462942726746801,
637
+ "grad_norm": 0.21132701635360718,
638
+ "learning_rate": 1.3857106125859743e-05,
639
+ "loss": 0.3125,
640
+ "step": 45000
641
+ },
642
+ {
643
+ "epoch": 1.4623642090377322,
644
+ "grad_norm": 0.20240716636180878,
645
+ "learning_rate": 1.3455357716783443e-05,
646
+ "loss": 0.3172,
647
+ "step": 45500
648
+ },
649
+ {
650
+ "epoch": 1.4784341454007843,
651
+ "grad_norm": 0.2224823385477066,
652
+ "learning_rate": 1.3054412804525296e-05,
653
+ "loss": 0.3151,
654
+ "step": 46000
655
+ },
656
+ {
657
+ "epoch": 1.4945040817638362,
658
+ "grad_norm": 0.19261781871318817,
659
+ "learning_rate": 1.2652664395448997e-05,
660
+ "loss": 0.312,
661
+ "step": 46500
662
+ },
663
+ {
664
+ "epoch": 1.510574018126888,
665
+ "grad_norm": 0.16068917512893677,
666
+ "learning_rate": 1.2250915986372695e-05,
667
+ "loss": 0.3145,
668
+ "step": 47000
669
+ },
670
+ {
671
+ "epoch": 1.5266439544899402,
672
+ "grad_norm": 0.18192972242832184,
673
+ "learning_rate": 1.1849167577296394e-05,
674
+ "loss": 0.3134,
675
+ "step": 47500
676
+ },
677
+ {
678
+ "epoch": 1.5427138908529923,
679
+ "grad_norm": 0.19884943962097168,
680
+ "learning_rate": 1.1448222665038247e-05,
681
+ "loss": 0.3119,
682
+ "step": 48000
683
+ },
684
+ {
685
+ "epoch": 1.5587838272160441,
686
+ "grad_norm": 0.1883106529712677,
687
+ "learning_rate": 1.1046474255961948e-05,
688
+ "loss": 0.316,
689
+ "step": 48500
690
+ },
691
+ {
692
+ "epoch": 1.5748537635790962,
693
+ "grad_norm": 0.19331087172031403,
694
+ "learning_rate": 1.0644725846885646e-05,
695
+ "loss": 0.3135,
696
+ "step": 49000
697
+ },
698
+ {
699
+ "epoch": 1.5909236999421483,
700
+ "grad_norm": 0.20041531324386597,
701
+ "learning_rate": 1.0242977437809347e-05,
702
+ "loss": 0.3112,
703
+ "step": 49500
704
+ },
705
+ {
706
+ "epoch": 1.6069936363052002,
707
+ "grad_norm": 0.18530187010765076,
708
+ "learning_rate": 9.8420325255512e-06,
709
+ "loss": 0.3122,
710
+ "step": 50000
711
+ },
712
+ {
713
+ "epoch": 1.623063572668252,
714
+ "grad_norm": 0.22725620865821838,
715
+ "learning_rate": 9.4402841164749e-06,
716
+ "loss": 0.3122,
717
+ "step": 50500
718
+ },
719
+ {
720
+ "epoch": 1.6391335090313044,
721
+ "grad_norm": 0.23093479871749878,
722
+ "learning_rate": 9.0385357073986e-06,
723
+ "loss": 0.3149,
724
+ "step": 51000
725
+ },
726
+ {
727
+ "epoch": 1.6552034453943563,
728
+ "grad_norm": 0.19580845534801483,
729
+ "learning_rate": 8.6367872983223e-06,
730
+ "loss": 0.3121,
731
+ "step": 51500
732
+ },
733
+ {
734
+ "epoch": 1.6712733817574081,
735
+ "grad_norm": 0.1742846667766571,
736
+ "learning_rate": 8.235842386064153e-06,
737
+ "loss": 0.3094,
738
+ "step": 52000
739
+ },
740
+ {
741
+ "epoch": 1.6873433181204602,
742
+ "grad_norm": 0.18685191869735718,
743
+ "learning_rate": 7.834093976987852e-06,
744
+ "loss": 0.309,
745
+ "step": 52500
746
+ },
747
+ {
748
+ "epoch": 1.7034132544835123,
749
+ "grad_norm": 0.21959276497364044,
750
+ "learning_rate": 7.432345567911551e-06,
751
+ "loss": 0.3118,
752
+ "step": 53000
753
+ },
754
+ {
755
+ "epoch": 1.7194831908465642,
756
+ "grad_norm": 0.1935770958662033,
757
+ "learning_rate": 7.030597158835252e-06,
758
+ "loss": 0.3106,
759
+ "step": 53500
760
+ },
761
+ {
762
+ "epoch": 1.7355531272096163,
763
+ "grad_norm": 0.19977129995822906,
764
+ "learning_rate": 6.629652246577103e-06,
765
+ "loss": 0.3101,
766
+ "step": 54000
767
+ },
768
+ {
769
+ "epoch": 1.7516230635726684,
770
+ "grad_norm": 0.2006288766860962,
771
+ "learning_rate": 6.2279038375008035e-06,
772
+ "loss": 0.3099,
773
+ "step": 54500
774
+ },
775
+ {
776
+ "epoch": 1.7676929999357203,
777
+ "grad_norm": 0.19280743598937988,
778
+ "learning_rate": 5.826155428424504e-06,
779
+ "loss": 0.308,
780
+ "step": 55000
781
+ },
782
+ {
783
+ "epoch": 1.7837629362987721,
784
+ "grad_norm": 0.22095157206058502,
785
+ "learning_rate": 5.424407019348204e-06,
786
+ "loss": 0.3069,
787
+ "step": 55500
788
+ },
789
+ {
790
+ "epoch": 1.7998328726618242,
791
+ "grad_norm": 0.2091740071773529,
792
+ "learning_rate": 5.022658610271903e-06,
793
+ "loss": 0.3062,
794
+ "step": 56000
795
+ },
796
+ {
797
+ "epoch": 1.8159028090248763,
798
+ "grad_norm": 0.24772244691848755,
799
+ "learning_rate": 4.620910201195604e-06,
800
+ "loss": 0.3093,
801
+ "step": 56500
802
+ },
803
+ {
804
+ "epoch": 1.8319727453879282,
805
+ "grad_norm": 0.1973961740732193,
806
+ "learning_rate": 4.219161792119303e-06,
807
+ "loss": 0.309,
808
+ "step": 57000
809
+ },
810
+ {
811
+ "epoch": 1.8480426817509803,
812
+ "grad_norm": 0.22767914831638336,
813
+ "learning_rate": 3.817413383043003e-06,
814
+ "loss": 0.3109,
815
+ "step": 57500
816
+ },
817
+ {
818
+ "epoch": 1.8641126181140324,
819
+ "grad_norm": 0.21461111307144165,
820
+ "learning_rate": 3.416468470784856e-06,
821
+ "loss": 0.3075,
822
+ "step": 58000
823
+ },
824
+ {
825
+ "epoch": 1.8801825544770843,
826
+ "grad_norm": 0.24607454240322113,
827
+ "learning_rate": 3.0147200617085557e-06,
828
+ "loss": 0.3058,
829
+ "step": 58500
830
+ },
831
+ {
832
+ "epoch": 1.8962524908401361,
833
+ "grad_norm": 0.19667118787765503,
834
+ "learning_rate": 2.6129716526322558e-06,
835
+ "loss": 0.3072,
836
+ "step": 59000
837
+ },
838
+ {
839
+ "epoch": 1.9123224272031882,
840
+ "grad_norm": 0.22604137659072876,
841
+ "learning_rate": 2.211223243555956e-06,
842
+ "loss": 0.3064,
843
+ "step": 59500
844
+ },
845
+ {
846
+ "epoch": 1.9283923635662403,
847
+ "grad_norm": 0.1879967898130417,
848
+ "learning_rate": 1.8102783312978082e-06,
849
+ "loss": 0.3063,
850
+ "step": 60000
851
+ },
852
+ {
853
+ "epoch": 1.9444622999292922,
854
+ "grad_norm": 0.21271295845508575,
855
+ "learning_rate": 1.408529922221508e-06,
856
+ "loss": 0.3076,
857
+ "step": 60500
858
+ },
859
+ {
860
+ "epoch": 1.9605322362923443,
861
+ "grad_norm": 0.16714586317539215,
862
+ "learning_rate": 1.006781513145208e-06,
863
+ "loss": 0.3092,
864
+ "step": 61000
865
+ },
866
+ {
867
+ "epoch": 1.9766021726553964,
868
+ "grad_norm": 0.20666128396987915,
869
+ "learning_rate": 6.050331040689079e-07,
870
+ "loss": 0.3076,
871
+ "step": 61500
872
+ },
873
+ {
874
+ "epoch": 1.9926721090184483,
875
+ "grad_norm": 0.18590718507766724,
876
+ "learning_rate": 2.0328469499260785e-07,
877
+ "loss": 0.3063,
878
+ "step": 62000
879
+ }
880
+ ],
881
+ "logging_steps": 500,
882
+ "max_steps": 62228,
883
+ "num_input_tokens_seen": 0,
884
+ "num_train_epochs": 2,
885
+ "save_steps": 500,
886
+ "stateful_callbacks": {
887
+ "TrainerControl": {
888
+ "args": {
889
+ "should_epoch_stop": false,
890
+ "should_evaluate": false,
891
+ "should_log": false,
892
+ "should_save": true,
893
+ "should_training_stop": false
894
+ },
895
+ "attributes": {}
896
+ }
897
+ },
898
+ "total_flos": 1.3425893171842253e+17,
899
+ "train_batch_size": 32,
900
+ "trial_name": null,
901
+ "trial_params": null
902
+ }
checkpoints/{checkpoint-50500 → checkpoint-62000}/training_args.bin RENAMED
File without changes
checkpoints/{checkpoint-50606 → checkpoint-62228}/config.json RENAMED
File without changes
checkpoints/{checkpoint-50606 → checkpoint-62228}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-50606 → checkpoint-62228}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3919310dd9be089ca9fda66a54af866fbbc46bb80168530e49bf0e1ef3679903
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fee19ce79c6f45de80a2e273ede68b16d500dae3a2e3da26235d6b4ebc0f92e
3
  size 242041896
checkpoints/{checkpoint-50606 → checkpoint-62228}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b05e406ed4d4530965b9277e9a1cbc4a70f8cad14e0c8ec341ddc50d09b3410f
3
  size 484163514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86714a27c46ac133d6d9ea7835d73e013ce66bf9fdd762718e18dc2826d7ca1b
3
  size 484163514
checkpoints/{checkpoint-50000 → checkpoint-62228}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7104b52397c2d22c5f07e2c7c69df2f4e3b16e69b1fa33a7a279b00c24e95fd3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a17ab2d678c291632f4e799e0f8e429a6b4beb3bf190d75be1b7df3597fa44
3
  size 14244
checkpoints/{checkpoint-50000 → checkpoint-62228}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5634f5dc2aea7c0bb616a13f1d822ed45a44941fd4ba164c03b704cdc1334651
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db0587d2c7f25e21ab1d2cf91cf211b7d15b48dab687d8d10f7483541a03adb4
3
  size 988
checkpoints/{checkpoint-50500 → checkpoint-62228}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1877bc75530c23e0794a0bdf58244875e1d695a580a8471a1c6ce084c7c131d6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6668451fe2db52de44fd5918c452bb44aa29396a4e9e2cd5118e290aececb3f1
3
  size 1064
checkpoints/{checkpoint-50606 → checkpoint-62228}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-50606 → checkpoint-62228}/spiece.model RENAMED
File without changes
checkpoints/{checkpoint-50606 → checkpoint-62228}/tokenizer.json RENAMED
File without changes
checkpoints/{checkpoint-50606 → checkpoint-62228}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-62228/trainer_state.json ADDED
@@ -0,0 +1,902 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 62228,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016069936363052,
14
+ "grad_norm": 0.3969729542732239,
15
+ "learning_rate": 4.960146557819631e-05,
16
+ "loss": 2.05,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.032139872726104,
21
+ "grad_norm": 0.3822907507419586,
22
+ "learning_rate": 4.919971716912001e-05,
23
+ "loss": 1.1207,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.04820980908915601,
28
+ "grad_norm": 0.36019280552864075,
29
+ "learning_rate": 4.879796876004371e-05,
30
+ "loss": 0.9225,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.064279745452208,
35
+ "grad_norm": 0.30364033579826355,
36
+ "learning_rate": 4.8396220350967415e-05,
37
+ "loss": 0.8244,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.08034968181526002,
42
+ "grad_norm": 0.45634394884109497,
43
+ "learning_rate": 4.799447194189111e-05,
44
+ "loss": 0.7506,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.09641961817831202,
49
+ "grad_norm": 0.3562425374984741,
50
+ "learning_rate": 4.759272353281481e-05,
51
+ "loss": 0.7012,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.11248955454136401,
56
+ "grad_norm": 0.33726808428764343,
57
+ "learning_rate": 4.719097512373851e-05,
58
+ "loss": 0.6706,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.128559490904416,
63
+ "grad_norm": 0.30098849534988403,
64
+ "learning_rate": 4.678922671466221e-05,
65
+ "loss": 0.6308,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.14462942726746802,
70
+ "grad_norm": 0.29443585872650146,
71
+ "learning_rate": 4.6387478305585915e-05,
72
+ "loss": 0.6141,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.16069936363052004,
77
+ "grad_norm": 0.25647810101509094,
78
+ "learning_rate": 4.598572989650961e-05,
79
+ "loss": 0.5866,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.17676929999357202,
84
+ "grad_norm": 0.2516370415687561,
85
+ "learning_rate": 4.558398148743331e-05,
86
+ "loss": 0.5665,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.19283923635662403,
91
+ "grad_norm": 0.3337278366088867,
92
+ "learning_rate": 4.518223307835701e-05,
93
+ "loss": 0.5427,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.20890917271967602,
98
+ "grad_norm": 0.2592964470386505,
99
+ "learning_rate": 4.478048466928072e-05,
100
+ "loss": 0.5323,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.22497910908272803,
105
+ "grad_norm": 0.28550606966018677,
106
+ "learning_rate": 4.437873626020441e-05,
107
+ "loss": 0.5187,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.24104904544578004,
112
+ "grad_norm": 0.26474013924598694,
113
+ "learning_rate": 4.397698785112811e-05,
114
+ "loss": 0.5058,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.257118981808832,
119
+ "grad_norm": 0.3018198013305664,
120
+ "learning_rate": 4.3575239442051814e-05,
121
+ "loss": 0.5013,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.27318891817188407,
126
+ "grad_norm": 0.2628585994243622,
127
+ "learning_rate": 4.317349103297551e-05,
128
+ "loss": 0.4883,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.28925885453493605,
133
+ "grad_norm": 0.30172979831695557,
134
+ "learning_rate": 4.277174262389921e-05,
135
+ "loss": 0.4795,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.30532879089798803,
140
+ "grad_norm": 0.25293004512786865,
141
+ "learning_rate": 4.236999421482291e-05,
142
+ "loss": 0.4682,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.3213987272610401,
147
+ "grad_norm": 0.2726214528083801,
148
+ "learning_rate": 4.196824580574661e-05,
149
+ "loss": 0.4641,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.33746866362409206,
154
+ "grad_norm": 0.2570224106311798,
155
+ "learning_rate": 4.1566497396670314e-05,
156
+ "loss": 0.4556,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.35353859998714404,
161
+ "grad_norm": 0.26380738615989685,
162
+ "learning_rate": 4.1164748987594006e-05,
163
+ "loss": 0.449,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.369608536350196,
168
+ "grad_norm": 0.2555176913738251,
169
+ "learning_rate": 4.076300057851771e-05,
170
+ "loss": 0.4412,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.38567847271324807,
175
+ "grad_norm": 0.2122594565153122,
176
+ "learning_rate": 4.036125216944141e-05,
177
+ "loss": 0.4365,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.40174840907630005,
182
+ "grad_norm": 0.2333071529865265,
183
+ "learning_rate": 3.9959503760365116e-05,
184
+ "loss": 0.433,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.41781834543935203,
189
+ "grad_norm": 0.24873752892017365,
190
+ "learning_rate": 3.955775535128881e-05,
191
+ "loss": 0.4283,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.4338882818024041,
196
+ "grad_norm": 0.32416871190071106,
197
+ "learning_rate": 3.915600694221251e-05,
198
+ "loss": 0.4218,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.44995821816545606,
203
+ "grad_norm": 0.23515433073043823,
204
+ "learning_rate": 3.875425853313621e-05,
205
+ "loss": 0.4139,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.46602815452850804,
210
+ "grad_norm": 0.22002151608467102,
211
+ "learning_rate": 3.8353313620878064e-05,
212
+ "loss": 0.417,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 0.4820980908915601,
217
+ "grad_norm": 0.251897931098938,
218
+ "learning_rate": 3.795156521180176e-05,
219
+ "loss": 0.4106,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 0.49816802725461207,
224
+ "grad_norm": 0.26212435960769653,
225
+ "learning_rate": 3.754981680272546e-05,
226
+ "loss": 0.4037,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 0.514237963617664,
231
+ "grad_norm": 0.2718159258365631,
232
+ "learning_rate": 3.714887189046731e-05,
233
+ "loss": 0.402,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 0.530307899980716,
238
+ "grad_norm": 0.23812739551067352,
239
+ "learning_rate": 3.674712348139102e-05,
240
+ "loss": 0.3953,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 0.5463778363437681,
245
+ "grad_norm": 0.21076083183288574,
246
+ "learning_rate": 3.634537507231471e-05,
247
+ "loss": 0.3938,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 0.5624477727068201,
252
+ "grad_norm": 0.25489869713783264,
253
+ "learning_rate": 3.5943626663238416e-05,
254
+ "loss": 0.3921,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 0.5785177090698721,
259
+ "grad_norm": 0.24057357013225555,
260
+ "learning_rate": 3.5541878254162115e-05,
261
+ "loss": 0.3867,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 0.5945876454329241,
266
+ "grad_norm": 0.24298915266990662,
267
+ "learning_rate": 3.514012984508582e-05,
268
+ "loss": 0.3868,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 0.6106575817959761,
273
+ "grad_norm": 0.2183919996023178,
274
+ "learning_rate": 3.473838143600951e-05,
275
+ "loss": 0.3803,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 0.626727518159028,
280
+ "grad_norm": 0.2278251349925995,
281
+ "learning_rate": 3.433663302693321e-05,
282
+ "loss": 0.3775,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 0.6427974545220801,
287
+ "grad_norm": 0.240201935172081,
288
+ "learning_rate": 3.393568811467507e-05,
289
+ "loss": 0.3751,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 0.6588673908851321,
294
+ "grad_norm": 0.21118561923503876,
295
+ "learning_rate": 3.353393970559877e-05,
296
+ "loss": 0.3742,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 0.6749373272481841,
301
+ "grad_norm": 0.22640825808048248,
302
+ "learning_rate": 3.313219129652247e-05,
303
+ "loss": 0.3729,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 0.6910072636112361,
308
+ "grad_norm": 0.23105542361736298,
309
+ "learning_rate": 3.2730442887446166e-05,
310
+ "loss": 0.3687,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 0.7070771999742881,
315
+ "grad_norm": 0.24791008234024048,
316
+ "learning_rate": 3.2329497975188024e-05,
317
+ "loss": 0.3658,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 0.7231471363373401,
322
+ "grad_norm": 0.2497881054878235,
323
+ "learning_rate": 3.1928553062929875e-05,
324
+ "loss": 0.3646,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.739217072700392,
329
+ "grad_norm": 0.2395261973142624,
330
+ "learning_rate": 3.152680465385357e-05,
331
+ "loss": 0.3655,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.7552870090634441,
336
+ "grad_norm": 0.21194589138031006,
337
+ "learning_rate": 3.112505624477727e-05,
338
+ "loss": 0.3646,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.7713569454264961,
343
+ "grad_norm": 0.21682508289813995,
344
+ "learning_rate": 3.072330783570097e-05,
345
+ "loss": 0.3629,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.7874268817895481,
350
+ "grad_norm": 0.23710566759109497,
351
+ "learning_rate": 3.0321559426624674e-05,
352
+ "loss": 0.3583,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.8034968181526001,
357
+ "grad_norm": 0.23857219517230988,
358
+ "learning_rate": 2.9919811017548372e-05,
359
+ "loss": 0.3561,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.8195667545156521,
364
+ "grad_norm": 0.241951584815979,
365
+ "learning_rate": 2.9518062608472075e-05,
366
+ "loss": 0.3537,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.8356366908787041,
371
+ "grad_norm": 0.275765061378479,
372
+ "learning_rate": 2.9116314199395773e-05,
373
+ "loss": 0.3493,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.8517066272417562,
378
+ "grad_norm": 0.24757184088230133,
379
+ "learning_rate": 2.871536928713762e-05,
380
+ "loss": 0.3486,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.8677765636048081,
385
+ "grad_norm": 0.21833688020706177,
386
+ "learning_rate": 2.8313620878061327e-05,
387
+ "loss": 0.3461,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 0.8838464999678601,
392
+ "grad_norm": 0.21623168885707855,
393
+ "learning_rate": 2.7911872468985022e-05,
394
+ "loss": 0.3468,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 0.8999164363309121,
399
+ "grad_norm": 0.20861521363258362,
400
+ "learning_rate": 2.7510124059908728e-05,
401
+ "loss": 0.3481,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 0.9159863726939641,
406
+ "grad_norm": 0.20291315019130707,
407
+ "learning_rate": 2.7108375650832423e-05,
408
+ "loss": 0.3474,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 0.9320563090570161,
413
+ "grad_norm": 0.2101660966873169,
414
+ "learning_rate": 2.6707430738574275e-05,
415
+ "loss": 0.3412,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 0.9481262454200682,
420
+ "grad_norm": 0.23224739730358124,
421
+ "learning_rate": 2.6305682329497977e-05,
422
+ "loss": 0.3422,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 0.9641961817831202,
427
+ "grad_norm": 0.22987599670886993,
428
+ "learning_rate": 2.5903933920421676e-05,
429
+ "loss": 0.3407,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 0.9802661181461721,
434
+ "grad_norm": 0.22307533025741577,
435
+ "learning_rate": 2.5502185511345378e-05,
436
+ "loss": 0.3365,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 0.9963360545092241,
441
+ "grad_norm": 0.20577801764011383,
442
+ "learning_rate": 2.510124059908723e-05,
443
+ "loss": 0.3409,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.0124059908722762,
448
+ "grad_norm": 0.23968417942523956,
449
+ "learning_rate": 2.4699492190010928e-05,
450
+ "loss": 0.339,
451
+ "step": 31500
452
+ },
453
+ {
454
+ "epoch": 1.028475927235328,
455
+ "grad_norm": 0.2166174054145813,
456
+ "learning_rate": 2.429774378093463e-05,
457
+ "loss": 0.3317,
458
+ "step": 32000
459
+ },
460
+ {
461
+ "epoch": 1.0445458635983802,
462
+ "grad_norm": 0.22259151935577393,
463
+ "learning_rate": 2.389599537185833e-05,
464
+ "loss": 0.3404,
465
+ "step": 32500
466
+ },
467
+ {
468
+ "epoch": 1.060615799961432,
469
+ "grad_norm": 0.2585219442844391,
470
+ "learning_rate": 2.3495050459600184e-05,
471
+ "loss": 0.3322,
472
+ "step": 33000
473
+ },
474
+ {
475
+ "epoch": 1.0766857363244842,
476
+ "grad_norm": 0.23949937522411346,
477
+ "learning_rate": 2.3093302050523882e-05,
478
+ "loss": 0.3332,
479
+ "step": 33500
480
+ },
481
+ {
482
+ "epoch": 1.0927556726875363,
483
+ "grad_norm": 0.2360944151878357,
484
+ "learning_rate": 2.269155364144758e-05,
485
+ "loss": 0.3374,
486
+ "step": 34000
487
+ },
488
+ {
489
+ "epoch": 1.1088256090505881,
490
+ "grad_norm": 0.23383018374443054,
491
+ "learning_rate": 2.228980523237128e-05,
492
+ "loss": 0.3287,
493
+ "step": 34500
494
+ },
495
+ {
496
+ "epoch": 1.1248955454136402,
497
+ "grad_norm": 0.25602060556411743,
498
+ "learning_rate": 2.1888860320113135e-05,
499
+ "loss": 0.3262,
500
+ "step": 35000
501
+ },
502
+ {
503
+ "epoch": 1.140965481776692,
504
+ "grad_norm": 0.2233658730983734,
505
+ "learning_rate": 2.1487111911036833e-05,
506
+ "loss": 0.3294,
507
+ "step": 35500
508
+ },
509
+ {
510
+ "epoch": 1.1570354181397442,
511
+ "grad_norm": 0.23545712232589722,
512
+ "learning_rate": 2.1085363501960532e-05,
513
+ "loss": 0.3263,
514
+ "step": 36000
515
+ },
516
+ {
517
+ "epoch": 1.173105354502796,
518
+ "grad_norm": 0.22479598224163055,
519
+ "learning_rate": 2.0683615092884234e-05,
520
+ "loss": 0.328,
521
+ "step": 36500
522
+ },
523
+ {
524
+ "epoch": 1.1891752908658482,
525
+ "grad_norm": 0.22207121551036835,
526
+ "learning_rate": 2.0282670180626086e-05,
527
+ "loss": 0.3275,
528
+ "step": 37000
529
+ },
530
+ {
531
+ "epoch": 1.2052452272289003,
532
+ "grad_norm": 0.23822110891342163,
533
+ "learning_rate": 1.9880921771549785e-05,
534
+ "loss": 0.3273,
535
+ "step": 37500
536
+ },
537
+ {
538
+ "epoch": 1.2213151635919521,
539
+ "grad_norm": 0.23664866387844086,
540
+ "learning_rate": 1.9479173362473487e-05,
541
+ "loss": 0.318,
542
+ "step": 38000
543
+ },
544
+ {
545
+ "epoch": 1.2373850999550042,
546
+ "grad_norm": 0.18543508648872375,
547
+ "learning_rate": 1.9077424953397185e-05,
548
+ "loss": 0.3235,
549
+ "step": 38500
550
+ },
551
+ {
552
+ "epoch": 1.253455036318056,
553
+ "grad_norm": 0.23305822908878326,
554
+ "learning_rate": 1.8676480041139037e-05,
555
+ "loss": 0.3243,
556
+ "step": 39000
557
+ },
558
+ {
559
+ "epoch": 1.2695249726811082,
560
+ "grad_norm": 0.21699073910713196,
561
+ "learning_rate": 1.827473163206274e-05,
562
+ "loss": 0.3222,
563
+ "step": 39500
564
+ },
565
+ {
566
+ "epoch": 1.28559490904416,
567
+ "grad_norm": 0.2757895588874817,
568
+ "learning_rate": 1.7872983222986438e-05,
569
+ "loss": 0.3248,
570
+ "step": 40000
571
+ },
572
+ {
573
+ "epoch": 1.3016648454072122,
574
+ "grad_norm": 0.19769324362277985,
575
+ "learning_rate": 1.7471234813910137e-05,
576
+ "loss": 0.3179,
577
+ "step": 40500
578
+ },
579
+ {
580
+ "epoch": 1.3177347817702643,
581
+ "grad_norm": 0.18964402377605438,
582
+ "learning_rate": 1.707028990165199e-05,
583
+ "loss": 0.3178,
584
+ "step": 41000
585
+ },
586
+ {
587
+ "epoch": 1.3338047181333161,
588
+ "grad_norm": 0.2584107220172882,
589
+ "learning_rate": 1.666854149257569e-05,
590
+ "loss": 0.318,
591
+ "step": 41500
592
+ },
593
+ {
594
+ "epoch": 1.3498746544963682,
595
+ "grad_norm": 0.25919750332832336,
596
+ "learning_rate": 1.626759658031754e-05,
597
+ "loss": 0.3205,
598
+ "step": 42000
599
+ },
600
+ {
601
+ "epoch": 1.3659445908594203,
602
+ "grad_norm": 0.24371759593486786,
603
+ "learning_rate": 1.5865848171241244e-05,
604
+ "loss": 0.3186,
605
+ "step": 42500
606
+ },
607
+ {
608
+ "epoch": 1.3820145272224722,
609
+ "grad_norm": 0.24457883834838867,
610
+ "learning_rate": 1.5464099762164942e-05,
611
+ "loss": 0.3162,
612
+ "step": 43000
613
+ },
614
+ {
615
+ "epoch": 1.398084463585524,
616
+ "grad_norm": 0.1918337345123291,
617
+ "learning_rate": 1.5062351353088641e-05,
618
+ "loss": 0.3169,
619
+ "step": 43500
620
+ },
621
+ {
622
+ "epoch": 1.4141543999485762,
623
+ "grad_norm": 0.2350657880306244,
624
+ "learning_rate": 1.4660602944012342e-05,
625
+ "loss": 0.3171,
626
+ "step": 44000
627
+ },
628
+ {
629
+ "epoch": 1.4302243363116283,
630
+ "grad_norm": 0.2481279820203781,
631
+ "learning_rate": 1.4258854534936042e-05,
632
+ "loss": 0.3179,
633
+ "step": 44500
634
+ },
635
+ {
636
+ "epoch": 1.4462942726746801,
637
+ "grad_norm": 0.21132701635360718,
638
+ "learning_rate": 1.3857106125859743e-05,
639
+ "loss": 0.3125,
640
+ "step": 45000
641
+ },
642
+ {
643
+ "epoch": 1.4623642090377322,
644
+ "grad_norm": 0.20240716636180878,
645
+ "learning_rate": 1.3455357716783443e-05,
646
+ "loss": 0.3172,
647
+ "step": 45500
648
+ },
649
+ {
650
+ "epoch": 1.4784341454007843,
651
+ "grad_norm": 0.2224823385477066,
652
+ "learning_rate": 1.3054412804525296e-05,
653
+ "loss": 0.3151,
654
+ "step": 46000
655
+ },
656
+ {
657
+ "epoch": 1.4945040817638362,
658
+ "grad_norm": 0.19261781871318817,
659
+ "learning_rate": 1.2652664395448997e-05,
660
+ "loss": 0.312,
661
+ "step": 46500
662
+ },
663
+ {
664
+ "epoch": 1.510574018126888,
665
+ "grad_norm": 0.16068917512893677,
666
+ "learning_rate": 1.2250915986372695e-05,
667
+ "loss": 0.3145,
668
+ "step": 47000
669
+ },
670
+ {
671
+ "epoch": 1.5266439544899402,
672
+ "grad_norm": 0.18192972242832184,
673
+ "learning_rate": 1.1849167577296394e-05,
674
+ "loss": 0.3134,
675
+ "step": 47500
676
+ },
677
+ {
678
+ "epoch": 1.5427138908529923,
679
+ "grad_norm": 0.19884943962097168,
680
+ "learning_rate": 1.1448222665038247e-05,
681
+ "loss": 0.3119,
682
+ "step": 48000
683
+ },
684
+ {
685
+ "epoch": 1.5587838272160441,
686
+ "grad_norm": 0.1883106529712677,
687
+ "learning_rate": 1.1046474255961948e-05,
688
+ "loss": 0.316,
689
+ "step": 48500
690
+ },
691
+ {
692
+ "epoch": 1.5748537635790962,
693
+ "grad_norm": 0.19331087172031403,
694
+ "learning_rate": 1.0644725846885646e-05,
695
+ "loss": 0.3135,
696
+ "step": 49000
697
+ },
698
+ {
699
+ "epoch": 1.5909236999421483,
700
+ "grad_norm": 0.20041531324386597,
701
+ "learning_rate": 1.0242977437809347e-05,
702
+ "loss": 0.3112,
703
+ "step": 49500
704
+ },
705
+ {
706
+ "epoch": 1.6069936363052002,
707
+ "grad_norm": 0.18530187010765076,
708
+ "learning_rate": 9.8420325255512e-06,
709
+ "loss": 0.3122,
710
+ "step": 50000
711
+ },
712
+ {
713
+ "epoch": 1.623063572668252,
714
+ "grad_norm": 0.22725620865821838,
715
+ "learning_rate": 9.4402841164749e-06,
716
+ "loss": 0.3122,
717
+ "step": 50500
718
+ },
719
+ {
720
+ "epoch": 1.6391335090313044,
721
+ "grad_norm": 0.23093479871749878,
722
+ "learning_rate": 9.0385357073986e-06,
723
+ "loss": 0.3149,
724
+ "step": 51000
725
+ },
726
+ {
727
+ "epoch": 1.6552034453943563,
728
+ "grad_norm": 0.19580845534801483,
729
+ "learning_rate": 8.6367872983223e-06,
730
+ "loss": 0.3121,
731
+ "step": 51500
732
+ },
733
+ {
734
+ "epoch": 1.6712733817574081,
735
+ "grad_norm": 0.1742846667766571,
736
+ "learning_rate": 8.235842386064153e-06,
737
+ "loss": 0.3094,
738
+ "step": 52000
739
+ },
740
+ {
741
+ "epoch": 1.6873433181204602,
742
+ "grad_norm": 0.18685191869735718,
743
+ "learning_rate": 7.834093976987852e-06,
744
+ "loss": 0.309,
745
+ "step": 52500
746
+ },
747
+ {
748
+ "epoch": 1.7034132544835123,
749
+ "grad_norm": 0.21959276497364044,
750
+ "learning_rate": 7.432345567911551e-06,
751
+ "loss": 0.3118,
752
+ "step": 53000
753
+ },
754
+ {
755
+ "epoch": 1.7194831908465642,
756
+ "grad_norm": 0.1935770958662033,
757
+ "learning_rate": 7.030597158835252e-06,
758
+ "loss": 0.3106,
759
+ "step": 53500
760
+ },
761
+ {
762
+ "epoch": 1.7355531272096163,
763
+ "grad_norm": 0.19977129995822906,
764
+ "learning_rate": 6.629652246577103e-06,
765
+ "loss": 0.3101,
766
+ "step": 54000
767
+ },
768
+ {
769
+ "epoch": 1.7516230635726684,
770
+ "grad_norm": 0.2006288766860962,
771
+ "learning_rate": 6.2279038375008035e-06,
772
+ "loss": 0.3099,
773
+ "step": 54500
774
+ },
775
+ {
776
+ "epoch": 1.7676929999357203,
777
+ "grad_norm": 0.19280743598937988,
778
+ "learning_rate": 5.826155428424504e-06,
779
+ "loss": 0.308,
780
+ "step": 55000
781
+ },
782
+ {
783
+ "epoch": 1.7837629362987721,
784
+ "grad_norm": 0.22095157206058502,
785
+ "learning_rate": 5.424407019348204e-06,
786
+ "loss": 0.3069,
787
+ "step": 55500
788
+ },
789
+ {
790
+ "epoch": 1.7998328726618242,
791
+ "grad_norm": 0.2091740071773529,
792
+ "learning_rate": 5.022658610271903e-06,
793
+ "loss": 0.3062,
794
+ "step": 56000
795
+ },
796
+ {
797
+ "epoch": 1.8159028090248763,
798
+ "grad_norm": 0.24772244691848755,
799
+ "learning_rate": 4.620910201195604e-06,
800
+ "loss": 0.3093,
801
+ "step": 56500
802
+ },
803
+ {
804
+ "epoch": 1.8319727453879282,
805
+ "grad_norm": 0.1973961740732193,
806
+ "learning_rate": 4.219161792119303e-06,
807
+ "loss": 0.309,
808
+ "step": 57000
809
+ },
810
+ {
811
+ "epoch": 1.8480426817509803,
812
+ "grad_norm": 0.22767914831638336,
813
+ "learning_rate": 3.817413383043003e-06,
814
+ "loss": 0.3109,
815
+ "step": 57500
816
+ },
817
+ {
818
+ "epoch": 1.8641126181140324,
819
+ "grad_norm": 0.21461111307144165,
820
+ "learning_rate": 3.416468470784856e-06,
821
+ "loss": 0.3075,
822
+ "step": 58000
823
+ },
824
+ {
825
+ "epoch": 1.8801825544770843,
826
+ "grad_norm": 0.24607454240322113,
827
+ "learning_rate": 3.0147200617085557e-06,
828
+ "loss": 0.3058,
829
+ "step": 58500
830
+ },
831
+ {
832
+ "epoch": 1.8962524908401361,
833
+ "grad_norm": 0.19667118787765503,
834
+ "learning_rate": 2.6129716526322558e-06,
835
+ "loss": 0.3072,
836
+ "step": 59000
837
+ },
838
+ {
839
+ "epoch": 1.9123224272031882,
840
+ "grad_norm": 0.22604137659072876,
841
+ "learning_rate": 2.211223243555956e-06,
842
+ "loss": 0.3064,
843
+ "step": 59500
844
+ },
845
+ {
846
+ "epoch": 1.9283923635662403,
847
+ "grad_norm": 0.1879967898130417,
848
+ "learning_rate": 1.8102783312978082e-06,
849
+ "loss": 0.3063,
850
+ "step": 60000
851
+ },
852
+ {
853
+ "epoch": 1.9444622999292922,
854
+ "grad_norm": 0.21271295845508575,
855
+ "learning_rate": 1.408529922221508e-06,
856
+ "loss": 0.3076,
857
+ "step": 60500
858
+ },
859
+ {
860
+ "epoch": 1.9605322362923443,
861
+ "grad_norm": 0.16714586317539215,
862
+ "learning_rate": 1.006781513145208e-06,
863
+ "loss": 0.3092,
864
+ "step": 61000
865
+ },
866
+ {
867
+ "epoch": 1.9766021726553964,
868
+ "grad_norm": 0.20666128396987915,
869
+ "learning_rate": 6.050331040689079e-07,
870
+ "loss": 0.3076,
871
+ "step": 61500
872
+ },
873
+ {
874
+ "epoch": 1.9926721090184483,
875
+ "grad_norm": 0.18590718507766724,
876
+ "learning_rate": 2.0328469499260785e-07,
877
+ "loss": 0.3063,
878
+ "step": 62000
879
+ }
880
+ ],
881
+ "logging_steps": 500,
882
+ "max_steps": 62228,
883
+ "num_input_tokens_seen": 0,
884
+ "num_train_epochs": 2,
885
+ "save_steps": 500,
886
+ "stateful_callbacks": {
887
+ "TrainerControl": {
888
+ "args": {
889
+ "should_epoch_stop": false,
890
+ "should_evaluate": false,
891
+ "should_log": false,
892
+ "should_save": true,
893
+ "should_training_stop": true
894
+ },
895
+ "attributes": {}
896
+ }
897
+ },
898
+ "total_flos": 1.3475252326839091e+17,
899
+ "train_batch_size": 32,
900
+ "trial_name": null,
901
+ "trial_params": null
902
+ }
checkpoints/{checkpoint-50606 → checkpoint-62228}/training_args.bin RENAMED
File without changes
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3919310dd9be089ca9fda66a54af866fbbc46bb80168530e49bf0e1ef3679903
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fee19ce79c6f45de80a2e273ede68b16d500dae3a2e3da26235d6b4ebc0f92e
3
  size 242041896
src/data/__pycache__/generate_cyr_lat_pairs.cpython-312.pyc ADDED
Binary file (6.54 kB). View file