Stuti103 committed on
Commit
4fd4aff
·
verified ·
1 Parent(s): 7117d4a

Training in progress, step 13800, checkpoint

Browse files
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-13800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-13800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28e221fe135396abdaf55d2a95d207fca21b83fb988a190060f0484696c218f4
3
  size 3541119728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0a97ad14fc05d45b0bcdd00fe3398b94c48a3fba262343ce5b929ae2698e50d
3
  size 3541119728
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ac0d16d2e2fc9dc907a07cfd5c8b852be59d3f0f52d6eb5733f16c2c8a60b45
3
  size 778374186
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daa42de180142ef371b2befa10b05a6c991d650216eca013dca907dc8c2a9a76
3
  size 778374186
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70bd8ff573da934b3deb2e9a2a6d5458d0bae248e930b9d724ab763e81dd21a3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d529b3ead19a0f7e903fee329286bdbb85e6ac6fdf18146d635bf8003ed8ece
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6442453381691502,
6
  "eval_steps": 500,
7
- "global_step": 13500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12158,6 +12158,276 @@
12158
  "mean_token_accuracy": 0.8752416774630547,
12159
  "num_tokens": 22423922.0,
12160
  "step": 13500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12161
  }
12162
  ],
12163
  "logging_steps": 10,
@@ -12177,7 +12447,7 @@
12177
  "attributes": {}
12178
  }
12179
  },
12180
- "total_flos": 5.048330108480225e+17,
12181
  "train_batch_size": 2,
12182
  "trial_name": null,
12183
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6585619012395757,
6
  "eval_steps": 500,
7
+ "global_step": 13800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12158
  "mean_token_accuracy": 0.8752416774630547,
12159
  "num_tokens": 22423922.0,
12160
  "step": 13500
12161
+ },
12162
+ {
12163
+ "epoch": 0.6447225569381644,
12164
+ "grad_norm": 0.4227236807346344,
12165
+ "learning_rate": 1.3553328561202578e-05,
12166
+ "loss": 0.7471,
12167
+ "mean_token_accuracy": 0.8619314864277839,
12168
+ "num_tokens": 22442654.0,
12169
+ "step": 13510
12170
+ },
12171
+ {
12172
+ "epoch": 0.6451997757071786,
12173
+ "grad_norm": 0.4778737425804138,
12174
+ "learning_rate": 1.3548556430446197e-05,
12175
+ "loss": 0.6241,
12176
+ "mean_token_accuracy": 0.8725541770458222,
12177
+ "num_tokens": 22458565.0,
12178
+ "step": 13520
12179
+ },
12180
+ {
12181
+ "epoch": 0.6456769944761928,
12182
+ "grad_norm": 0.3292141854763031,
12183
+ "learning_rate": 1.3543784299689813e-05,
12184
+ "loss": 0.6019,
12185
+ "mean_token_accuracy": 0.8786483362317086,
12186
+ "num_tokens": 22475131.0,
12187
+ "step": 13530
12188
+ },
12189
+ {
12190
+ "epoch": 0.6461542132452069,
12191
+ "grad_norm": 0.3959347605705261,
12192
+ "learning_rate": 1.353901216893343e-05,
12193
+ "loss": 0.5363,
12194
+ "mean_token_accuracy": 0.8921607866883278,
12195
+ "num_tokens": 22490344.0,
12196
+ "step": 13540
12197
+ },
12198
+ {
12199
+ "epoch": 0.6466314320142211,
12200
+ "grad_norm": 0.3481472134590149,
12201
+ "learning_rate": 1.3534240038177047e-05,
12202
+ "loss": 0.608,
12203
+ "mean_token_accuracy": 0.8803831622004509,
12204
+ "num_tokens": 22505993.0,
12205
+ "step": 13550
12206
+ },
12207
+ {
12208
+ "epoch": 0.6471086507832353,
12209
+ "grad_norm": 0.3353317081928253,
12210
+ "learning_rate": 1.3529467907420663e-05,
12211
+ "loss": 0.6797,
12212
+ "mean_token_accuracy": 0.8620204761624336,
12213
+ "num_tokens": 22523092.0,
12214
+ "step": 13560
12215
+ },
12216
+ {
12217
+ "epoch": 0.6475858695522495,
12218
+ "grad_norm": 0.33590102195739746,
12219
+ "learning_rate": 1.3524695776664283e-05,
12220
+ "loss": 0.6826,
12221
+ "mean_token_accuracy": 0.8625424951314926,
12222
+ "num_tokens": 22540022.0,
12223
+ "step": 13570
12224
+ },
12225
+ {
12226
+ "epoch": 0.6480630883212637,
12227
+ "grad_norm": 0.3883362412452698,
12228
+ "learning_rate": 1.35199236459079e-05,
12229
+ "loss": 0.7065,
12230
+ "mean_token_accuracy": 0.8587613448500633,
12231
+ "num_tokens": 22557276.0,
12232
+ "step": 13580
12233
+ },
12234
+ {
12235
+ "epoch": 0.6485403070902779,
12236
+ "grad_norm": 0.329208642244339,
12237
+ "learning_rate": 1.3515151515151517e-05,
12238
+ "loss": 0.6851,
12239
+ "mean_token_accuracy": 0.8579544991254806,
12240
+ "num_tokens": 22575264.0,
12241
+ "step": 13590
12242
+ },
12243
+ {
12244
+ "epoch": 0.6490175258592921,
12245
+ "grad_norm": 0.3257433772087097,
12246
+ "learning_rate": 1.3510379384395133e-05,
12247
+ "loss": 0.6501,
12248
+ "mean_token_accuracy": 0.8753771096467972,
12249
+ "num_tokens": 22592298.0,
12250
+ "step": 13600
12251
+ },
12252
+ {
12253
+ "epoch": 0.6494947446283063,
12254
+ "grad_norm": 0.319042444229126,
12255
+ "learning_rate": 1.350560725363875e-05,
12256
+ "loss": 0.6453,
12257
+ "mean_token_accuracy": 0.8721793726086616,
12258
+ "num_tokens": 22609547.0,
12259
+ "step": 13610
12260
+ },
12261
+ {
12262
+ "epoch": 0.6499719633973204,
12263
+ "grad_norm": 0.34079188108444214,
12264
+ "learning_rate": 1.3500835122882368e-05,
12265
+ "loss": 0.7824,
12266
+ "mean_token_accuracy": 0.8555491074919701,
12267
+ "num_tokens": 22628228.0,
12268
+ "step": 13620
12269
+ },
12270
+ {
12271
+ "epoch": 0.6504491821663346,
12272
+ "grad_norm": 0.45218825340270996,
12273
+ "learning_rate": 1.3496062992125985e-05,
12274
+ "loss": 0.634,
12275
+ "mean_token_accuracy": 0.8678511619567871,
12276
+ "num_tokens": 22643854.0,
12277
+ "step": 13630
12278
+ },
12279
+ {
12280
+ "epoch": 0.6509264009353488,
12281
+ "grad_norm": 0.2865401804447174,
12282
+ "learning_rate": 1.3491290861369603e-05,
12283
+ "loss": 0.6747,
12284
+ "mean_token_accuracy": 0.8544631570577621,
12285
+ "num_tokens": 22661164.0,
12286
+ "step": 13640
12287
+ },
12288
+ {
12289
+ "epoch": 0.651403619704363,
12290
+ "grad_norm": 0.4217221736907959,
12291
+ "learning_rate": 1.348651873061322e-05,
12292
+ "loss": 0.5285,
12293
+ "mean_token_accuracy": 0.8921225979924202,
12294
+ "num_tokens": 22676050.0,
12295
+ "step": 13650
12296
+ },
12297
+ {
12298
+ "epoch": 0.6518808384733772,
12299
+ "grad_norm": 0.4127669334411621,
12300
+ "learning_rate": 1.3481746599856837e-05,
12301
+ "loss": 0.5793,
12302
+ "mean_token_accuracy": 0.886702474951744,
12303
+ "num_tokens": 22691960.0,
12304
+ "step": 13660
12305
+ },
12306
+ {
12307
+ "epoch": 0.6523580572423914,
12308
+ "grad_norm": 0.3422595262527466,
12309
+ "learning_rate": 1.3476974469100455e-05,
12310
+ "loss": 0.6475,
12311
+ "mean_token_accuracy": 0.8713901385664939,
12312
+ "num_tokens": 22707891.0,
12313
+ "step": 13670
12314
+ },
12315
+ {
12316
+ "epoch": 0.6528352760114056,
12317
+ "grad_norm": 0.4279707372188568,
12318
+ "learning_rate": 1.3472202338344072e-05,
12319
+ "loss": 0.7561,
12320
+ "mean_token_accuracy": 0.8528080299496651,
12321
+ "num_tokens": 22726155.0,
12322
+ "step": 13680
12323
+ },
12324
+ {
12325
+ "epoch": 0.6533124947804198,
12326
+ "grad_norm": 0.3606453239917755,
12327
+ "learning_rate": 1.3467430207587688e-05,
12328
+ "loss": 0.7429,
12329
+ "mean_token_accuracy": 0.864837720990181,
12330
+ "num_tokens": 22744235.0,
12331
+ "step": 13690
12332
+ },
12333
+ {
12334
+ "epoch": 0.6537897135494339,
12335
+ "grad_norm": 0.38309189677238464,
12336
+ "learning_rate": 1.3462658076831305e-05,
12337
+ "loss": 0.6402,
12338
+ "mean_token_accuracy": 0.8674290254712105,
12339
+ "num_tokens": 22760300.0,
12340
+ "step": 13700
12341
+ },
12342
+ {
12343
+ "epoch": 0.6542669323184481,
12344
+ "grad_norm": 0.30889174342155457,
12345
+ "learning_rate": 1.3457885946074925e-05,
12346
+ "loss": 0.7359,
12347
+ "mean_token_accuracy": 0.8531943425536156,
12348
+ "num_tokens": 22778164.0,
12349
+ "step": 13710
12350
+ },
12351
+ {
12352
+ "epoch": 0.6547441510874623,
12353
+ "grad_norm": 0.3210035264492035,
12354
+ "learning_rate": 1.3453113815318542e-05,
12355
+ "loss": 0.5691,
12356
+ "mean_token_accuracy": 0.892583754658699,
12357
+ "num_tokens": 22792570.0,
12358
+ "step": 13720
12359
+ },
12360
+ {
12361
+ "epoch": 0.6552213698564765,
12362
+ "grad_norm": 0.2989923357963562,
12363
+ "learning_rate": 1.3448341684562158e-05,
12364
+ "loss": 0.5318,
12365
+ "mean_token_accuracy": 0.8902983129024505,
12366
+ "num_tokens": 22807960.0,
12367
+ "step": 13730
12368
+ },
12369
+ {
12370
+ "epoch": 0.6556985886254907,
12371
+ "grad_norm": 0.381619393825531,
12372
+ "learning_rate": 1.3443569553805775e-05,
12373
+ "loss": 0.717,
12374
+ "mean_token_accuracy": 0.8556675240397453,
12375
+ "num_tokens": 22826319.0,
12376
+ "step": 13740
12377
+ },
12378
+ {
12379
+ "epoch": 0.6561758073945049,
12380
+ "grad_norm": 0.33662042021751404,
12381
+ "learning_rate": 1.3438797423049392e-05,
12382
+ "loss": 0.6839,
12383
+ "mean_token_accuracy": 0.8628215402364731,
12384
+ "num_tokens": 22843310.0,
12385
+ "step": 13750
12386
+ },
12387
+ {
12388
+ "epoch": 0.6566530261635191,
12389
+ "grad_norm": 0.30493494868278503,
12390
+ "learning_rate": 1.3434025292293008e-05,
12391
+ "loss": 0.5978,
12392
+ "mean_token_accuracy": 0.8812342941761017,
12393
+ "num_tokens": 22858101.0,
12394
+ "step": 13760
12395
+ },
12396
+ {
12397
+ "epoch": 0.6571302449325332,
12398
+ "grad_norm": 0.4126700460910797,
12399
+ "learning_rate": 1.3429253161536627e-05,
12400
+ "loss": 0.652,
12401
+ "mean_token_accuracy": 0.8751346081495285,
12402
+ "num_tokens": 22874216.0,
12403
+ "step": 13770
12404
+ },
12405
+ {
12406
+ "epoch": 0.6576074637015474,
12407
+ "grad_norm": 0.3574364185333252,
12408
+ "learning_rate": 1.3424481030780245e-05,
12409
+ "loss": 0.6295,
12410
+ "mean_token_accuracy": 0.8706316411495209,
12411
+ "num_tokens": 22889521.0,
12412
+ "step": 13780
12413
+ },
12414
+ {
12415
+ "epoch": 0.6580846824705616,
12416
+ "grad_norm": 0.4885793924331665,
12417
+ "learning_rate": 1.3419708900023862e-05,
12418
+ "loss": 0.5987,
12419
+ "mean_token_accuracy": 0.8812028467655182,
12420
+ "num_tokens": 22907349.0,
12421
+ "step": 13790
12422
+ },
12423
+ {
12424
+ "epoch": 0.6585619012395757,
12425
+ "grad_norm": 0.33491700887680054,
12426
+ "learning_rate": 1.3414936769267479e-05,
12427
+ "loss": 0.6659,
12428
+ "mean_token_accuracy": 0.8643362104892731,
12429
+ "num_tokens": 22924082.0,
12430
+ "step": 13800
12431
  }
12432
  ],
12433
  "logging_steps": 10,
 
12447
  "attributes": {}
12448
  }
12449
  },
12450
+ "total_flos": 5.161398467590963e+17,
12451
  "train_batch_size": 2,
12452
  "trial_name": null,
12453
  "trial_params": null