Pranay17 commited on
Commit
a07764c
·
verified ·
1 Parent(s): cb0bdf2

Training in progress, step 140000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3181ba5c18e42ea6a2e47ced69a1726cab9aedbdb76f8d38d78b6f84e74b0eb
3
  size 42002584
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7be691eaf282bf25064ee3473aaeb025e9deec8018cf1bbb96477efef4e9f02
3
  size 42002584
last-checkpoint/global_step140000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c197028331b969712006da6a095d71f916f44905e7b83b2ecb270a4fe4656482
3
+ size 251710672
last-checkpoint/global_step140000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79dc08cbd56da6ba5c99743c1aad9b3100799c6bd1e0fcbaf797d35378ac156f
3
+ size 153747385
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step130000
 
1
+ global_step140000
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a6411ad924de80114eb71f3836dc652ae4d7de32682ad3c82b390b42d21c9d5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86c469f6100d588a6046fb5482072daba9d88dc1761bd5af2080993ae538413c
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.881408055414564,
5
  "eval_steps": 1000,
6
- "global_step": 130000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -18214,6 +18214,1406 @@
18214
  "learning_rate": 0.00016285864490654233,
18215
  "loss": 1.2988,
18216
  "step": 130000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18217
  }
18218
  ],
18219
  "logging_steps": 50,
@@ -18233,7 +19633,7 @@
18233
  "attributes": {}
18234
  }
18235
  },
18236
- "total_flos": 3.2834368542993285e+18,
18237
  "train_batch_size": 2,
18238
  "trial_name": null,
18239
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.179977905831069,
5
  "eval_steps": 1000,
6
+ "global_step": 140000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
18214
  "learning_rate": 0.00016285864490654233,
18215
  "loss": 1.2988,
18216
  "step": 130000
18217
+ },
18218
+ {
18219
+ "epoch": 3.8829009046666467,
18220
+ "grad_norm": 3.8731918334960938,
18221
+ "learning_rate": 0.0001628443591106235,
18222
+ "loss": 1.2158,
18223
+ "step": 130050
18224
+ },
18225
+ {
18226
+ "epoch": 3.8843937539187294,
18227
+ "grad_norm": 4.860089302062988,
18228
+ "learning_rate": 0.00016283007331470466,
18229
+ "loss": 1.3152,
18230
+ "step": 130100
18231
+ },
18232
+ {
18233
+ "epoch": 3.8858866031708117,
18234
+ "grad_norm": 3.8552567958831787,
18235
+ "learning_rate": 0.00016281578751878583,
18236
+ "loss": 1.2038,
18237
+ "step": 130150
18238
+ },
18239
+ {
18240
+ "epoch": 3.8873794524228944,
18241
+ "grad_norm": 3.8789255619049072,
18242
+ "learning_rate": 0.000162801501722867,
18243
+ "loss": 1.1898,
18244
+ "step": 130200
18245
+ },
18246
+ {
18247
+ "epoch": 3.8888723016749767,
18248
+ "grad_norm": 5.833735466003418,
18249
+ "learning_rate": 0.00016278721592694815,
18250
+ "loss": 1.327,
18251
+ "step": 130250
18252
+ },
18253
+ {
18254
+ "epoch": 3.8903651509270594,
18255
+ "grad_norm": 5.1781463623046875,
18256
+ "learning_rate": 0.00016277293013102932,
18257
+ "loss": 1.3002,
18258
+ "step": 130300
18259
+ },
18260
+ {
18261
+ "epoch": 3.891858000179142,
18262
+ "grad_norm": 5.102818012237549,
18263
+ "learning_rate": 0.0001627586443351105,
18264
+ "loss": 1.257,
18265
+ "step": 130350
18266
+ },
18267
+ {
18268
+ "epoch": 3.8933508494312243,
18269
+ "grad_norm": 4.642506122589111,
18270
+ "learning_rate": 0.00016274435853919165,
18271
+ "loss": 1.2387,
18272
+ "step": 130400
18273
+ },
18274
+ {
18275
+ "epoch": 3.894843698683307,
18276
+ "grad_norm": 4.327921390533447,
18277
+ "learning_rate": 0.00016273007274327284,
18278
+ "loss": 1.2563,
18279
+ "step": 130450
18280
+ },
18281
+ {
18282
+ "epoch": 3.8963365479353893,
18283
+ "grad_norm": 4.99731969833374,
18284
+ "learning_rate": 0.00016271578694735398,
18285
+ "loss": 1.2621,
18286
+ "step": 130500
18287
+ },
18288
+ {
18289
+ "epoch": 3.897829397187472,
18290
+ "grad_norm": 3.6538639068603516,
18291
+ "learning_rate": 0.00016270150115143517,
18292
+ "loss": 1.2284,
18293
+ "step": 130550
18294
+ },
18295
+ {
18296
+ "epoch": 3.8993222464395547,
18297
+ "grad_norm": 4.081072807312012,
18298
+ "learning_rate": 0.0001626872153555163,
18299
+ "loss": 1.2901,
18300
+ "step": 130600
18301
+ },
18302
+ {
18303
+ "epoch": 3.900815095691637,
18304
+ "grad_norm": 5.384579658508301,
18305
+ "learning_rate": 0.0001626729295595975,
18306
+ "loss": 1.2898,
18307
+ "step": 130650
18308
+ },
18309
+ {
18310
+ "epoch": 3.9023079449437197,
18311
+ "grad_norm": 4.1314520835876465,
18312
+ "learning_rate": 0.00016265864376367866,
18313
+ "loss": 1.2692,
18314
+ "step": 130700
18315
+ },
18316
+ {
18317
+ "epoch": 3.903800794195802,
18318
+ "grad_norm": 5.35874080657959,
18319
+ "learning_rate": 0.00016264435796775983,
18320
+ "loss": 1.2377,
18321
+ "step": 130750
18322
+ },
18323
+ {
18324
+ "epoch": 3.9052936434478847,
18325
+ "grad_norm": 4.041543006896973,
18326
+ "learning_rate": 0.000162630072171841,
18327
+ "loss": 1.2757,
18328
+ "step": 130800
18329
+ },
18330
+ {
18331
+ "epoch": 3.9067864926999674,
18332
+ "grad_norm": 4.784633636474609,
18333
+ "learning_rate": 0.00016261578637592216,
18334
+ "loss": 1.234,
18335
+ "step": 130850
18336
+ },
18337
+ {
18338
+ "epoch": 3.9082793419520496,
18339
+ "grad_norm": 4.274779319763184,
18340
+ "learning_rate": 0.00016260150058000332,
18341
+ "loss": 1.2506,
18342
+ "step": 130900
18343
+ },
18344
+ {
18345
+ "epoch": 3.9097721912041323,
18346
+ "grad_norm": 4.858269214630127,
18347
+ "learning_rate": 0.0001625872147840845,
18348
+ "loss": 1.2345,
18349
+ "step": 130950
18350
+ },
18351
+ {
18352
+ "epoch": 3.9112650404562146,
18353
+ "grad_norm": 3.8560070991516113,
18354
+ "learning_rate": 0.00016257292898816565,
18355
+ "loss": 1.2689,
18356
+ "step": 131000
18357
+ },
18358
+ {
18359
+ "epoch": 3.9127578897082973,
18360
+ "grad_norm": 4.95020866394043,
18361
+ "learning_rate": 0.00016255864319224682,
18362
+ "loss": 1.2156,
18363
+ "step": 131050
18364
+ },
18365
+ {
18366
+ "epoch": 3.91425073896038,
18367
+ "grad_norm": 4.7120795249938965,
18368
+ "learning_rate": 0.00016254435739632798,
18369
+ "loss": 1.3119,
18370
+ "step": 131100
18371
+ },
18372
+ {
18373
+ "epoch": 3.9157435882124623,
18374
+ "grad_norm": 5.613494873046875,
18375
+ "learning_rate": 0.00016253007160040917,
18376
+ "loss": 1.2806,
18377
+ "step": 131150
18378
+ },
18379
+ {
18380
+ "epoch": 3.917236437464545,
18381
+ "grad_norm": 5.0516767501831055,
18382
+ "learning_rate": 0.0001625157858044903,
18383
+ "loss": 1.2254,
18384
+ "step": 131200
18385
+ },
18386
+ {
18387
+ "epoch": 3.9187292867166272,
18388
+ "grad_norm": 4.780202388763428,
18389
+ "learning_rate": 0.0001625015000085715,
18390
+ "loss": 1.2915,
18391
+ "step": 131250
18392
+ },
18393
+ {
18394
+ "epoch": 3.92022213596871,
18395
+ "grad_norm": 5.579587459564209,
18396
+ "learning_rate": 0.00016248721421265264,
18397
+ "loss": 1.2509,
18398
+ "step": 131300
18399
+ },
18400
+ {
18401
+ "epoch": 3.9217149852207926,
18402
+ "grad_norm": 4.543426990509033,
18403
+ "learning_rate": 0.00016247292841673383,
18404
+ "loss": 1.2974,
18405
+ "step": 131350
18406
+ },
18407
+ {
18408
+ "epoch": 3.923207834472875,
18409
+ "grad_norm": 5.310088634490967,
18410
+ "learning_rate": 0.000162458642620815,
18411
+ "loss": 1.2247,
18412
+ "step": 131400
18413
+ },
18414
+ {
18415
+ "epoch": 3.9247006837249576,
18416
+ "grad_norm": 4.900899887084961,
18417
+ "learning_rate": 0.00016244435682489616,
18418
+ "loss": 1.3038,
18419
+ "step": 131450
18420
+ },
18421
+ {
18422
+ "epoch": 3.92619353297704,
18423
+ "grad_norm": 4.816722869873047,
18424
+ "learning_rate": 0.00016243007102897732,
18425
+ "loss": 1.2484,
18426
+ "step": 131500
18427
+ },
18428
+ {
18429
+ "epoch": 3.9276863822291226,
18430
+ "grad_norm": 3.618678092956543,
18431
+ "learning_rate": 0.0001624157852330585,
18432
+ "loss": 1.3475,
18433
+ "step": 131550
18434
+ },
18435
+ {
18436
+ "epoch": 3.9291792314812053,
18437
+ "grad_norm": 5.137662410736084,
18438
+ "learning_rate": 0.00016240149943713965,
18439
+ "loss": 1.2443,
18440
+ "step": 131600
18441
+ },
18442
+ {
18443
+ "epoch": 3.9306720807332876,
18444
+ "grad_norm": 4.798673152923584,
18445
+ "learning_rate": 0.00016238721364122082,
18446
+ "loss": 1.2654,
18447
+ "step": 131650
18448
+ },
18449
+ {
18450
+ "epoch": 3.93216492998537,
18451
+ "grad_norm": 4.472255229949951,
18452
+ "learning_rate": 0.00016237292784530198,
18453
+ "loss": 1.2686,
18454
+ "step": 131700
18455
+ },
18456
+ {
18457
+ "epoch": 3.9336577792374525,
18458
+ "grad_norm": 4.61024284362793,
18459
+ "learning_rate": 0.00016235864204938315,
18460
+ "loss": 1.2491,
18461
+ "step": 131750
18462
+ },
18463
+ {
18464
+ "epoch": 3.9351506284895352,
18465
+ "grad_norm": 4.1179280281066895,
18466
+ "learning_rate": 0.0001623443562534643,
18467
+ "loss": 1.2044,
18468
+ "step": 131800
18469
+ },
18470
+ {
18471
+ "epoch": 3.9366434777416175,
18472
+ "grad_norm": 6.594708442687988,
18473
+ "learning_rate": 0.00016233007045754548,
18474
+ "loss": 1.2111,
18475
+ "step": 131850
18476
+ },
18477
+ {
18478
+ "epoch": 3.9381363269937,
18479
+ "grad_norm": 8.489596366882324,
18480
+ "learning_rate": 0.00016231578466162664,
18481
+ "loss": 1.2011,
18482
+ "step": 131900
18483
+ },
18484
+ {
18485
+ "epoch": 3.9396291762457825,
18486
+ "grad_norm": 5.426854133605957,
18487
+ "learning_rate": 0.0001623014988657078,
18488
+ "loss": 1.268,
18489
+ "step": 131950
18490
+ },
18491
+ {
18492
+ "epoch": 3.941122025497865,
18493
+ "grad_norm": 4.155928611755371,
18494
+ "learning_rate": 0.00016228721306978897,
18495
+ "loss": 1.2722,
18496
+ "step": 132000
18497
+ },
18498
+ {
18499
+ "epoch": 3.942614874749948,
18500
+ "grad_norm": 4.766868591308594,
18501
+ "learning_rate": 0.00016227292727387014,
18502
+ "loss": 1.288,
18503
+ "step": 132050
18504
+ },
18505
+ {
18506
+ "epoch": 3.94410772400203,
18507
+ "grad_norm": 4.16912317276001,
18508
+ "learning_rate": 0.0001622586414779513,
18509
+ "loss": 1.2261,
18510
+ "step": 132100
18511
+ },
18512
+ {
18513
+ "epoch": 3.945600573254113,
18514
+ "grad_norm": 6.41418981552124,
18515
+ "learning_rate": 0.00016224435568203247,
18516
+ "loss": 1.2414,
18517
+ "step": 132150
18518
+ },
18519
+ {
18520
+ "epoch": 3.947093422506195,
18521
+ "grad_norm": 4.736821174621582,
18522
+ "learning_rate": 0.00016223006988611366,
18523
+ "loss": 1.2481,
18524
+ "step": 132200
18525
+ },
18526
+ {
18527
+ "epoch": 3.948586271758278,
18528
+ "grad_norm": 4.794166564941406,
18529
+ "learning_rate": 0.0001622157840901948,
18530
+ "loss": 1.2831,
18531
+ "step": 132250
18532
+ },
18533
+ {
18534
+ "epoch": 3.9500791210103605,
18535
+ "grad_norm": 6.948697090148926,
18536
+ "learning_rate": 0.000162201498294276,
18537
+ "loss": 1.2545,
18538
+ "step": 132300
18539
+ },
18540
+ {
18541
+ "epoch": 3.951571970262443,
18542
+ "grad_norm": 5.114023208618164,
18543
+ "learning_rate": 0.00016218721249835712,
18544
+ "loss": 1.2809,
18545
+ "step": 132350
18546
+ },
18547
+ {
18548
+ "epoch": 3.9530648195145255,
18549
+ "grad_norm": 4.972701072692871,
18550
+ "learning_rate": 0.00016217292670243832,
18551
+ "loss": 1.2143,
18552
+ "step": 132400
18553
+ },
18554
+ {
18555
+ "epoch": 3.9545576687666077,
18556
+ "grad_norm": 3.860616445541382,
18557
+ "learning_rate": 0.00016215864090651945,
18558
+ "loss": 1.2209,
18559
+ "step": 132450
18560
+ },
18561
+ {
18562
+ "epoch": 3.9560505180186905,
18563
+ "grad_norm": 3.6322524547576904,
18564
+ "learning_rate": 0.00016214435511060065,
18565
+ "loss": 1.2252,
18566
+ "step": 132500
18567
+ },
18568
+ {
18569
+ "epoch": 3.957543367270773,
18570
+ "grad_norm": 5.153745174407959,
18571
+ "learning_rate": 0.0001621300693146818,
18572
+ "loss": 1.2748,
18573
+ "step": 132550
18574
+ },
18575
+ {
18576
+ "epoch": 3.9590362165228554,
18577
+ "grad_norm": 4.3787617683410645,
18578
+ "learning_rate": 0.00016211578351876297,
18579
+ "loss": 1.2634,
18580
+ "step": 132600
18581
+ },
18582
+ {
18583
+ "epoch": 3.960529065774938,
18584
+ "grad_norm": 4.335618019104004,
18585
+ "learning_rate": 0.00016210149772284414,
18586
+ "loss": 1.2661,
18587
+ "step": 132650
18588
+ },
18589
+ {
18590
+ "epoch": 3.9620219150270204,
18591
+ "grad_norm": 4.789446830749512,
18592
+ "learning_rate": 0.0001620872119269253,
18593
+ "loss": 1.2877,
18594
+ "step": 132700
18595
+ },
18596
+ {
18597
+ "epoch": 3.963514764279103,
18598
+ "grad_norm": 5.508213996887207,
18599
+ "learning_rate": 0.00016207292613100647,
18600
+ "loss": 1.2759,
18601
+ "step": 132750
18602
+ },
18603
+ {
18604
+ "epoch": 3.965007613531186,
18605
+ "grad_norm": 3.3553807735443115,
18606
+ "learning_rate": 0.00016205864033508763,
18607
+ "loss": 1.2907,
18608
+ "step": 132800
18609
+ },
18610
+ {
18611
+ "epoch": 3.966500462783268,
18612
+ "grad_norm": 4.773813247680664,
18613
+ "learning_rate": 0.0001620443545391688,
18614
+ "loss": 1.3115,
18615
+ "step": 132850
18616
+ },
18617
+ {
18618
+ "epoch": 3.9679933120353508,
18619
+ "grad_norm": 4.2718939781188965,
18620
+ "learning_rate": 0.00016203006874324996,
18621
+ "loss": 1.2467,
18622
+ "step": 132900
18623
+ },
18624
+ {
18625
+ "epoch": 3.969486161287433,
18626
+ "grad_norm": 4.36405086517334,
18627
+ "learning_rate": 0.00016201578294733113,
18628
+ "loss": 1.2536,
18629
+ "step": 132950
18630
+ },
18631
+ {
18632
+ "epoch": 3.9709790105395157,
18633
+ "grad_norm": 5.21968936920166,
18634
+ "learning_rate": 0.00016200149715141232,
18635
+ "loss": 1.3317,
18636
+ "step": 133000
18637
+ },
18638
+ {
18639
+ "epoch": 3.9724718597915984,
18640
+ "grad_norm": 3.792954683303833,
18641
+ "learning_rate": 0.00016198721135549346,
18642
+ "loss": 1.2041,
18643
+ "step": 133050
18644
+ },
18645
+ {
18646
+ "epoch": 3.9739647090436807,
18647
+ "grad_norm": 4.445356369018555,
18648
+ "learning_rate": 0.00016197292555957465,
18649
+ "loss": 1.2283,
18650
+ "step": 133100
18651
+ },
18652
+ {
18653
+ "epoch": 3.9754575582957634,
18654
+ "grad_norm": 4.6043548583984375,
18655
+ "learning_rate": 0.00016195863976365579,
18656
+ "loss": 1.328,
18657
+ "step": 133150
18658
+ },
18659
+ {
18660
+ "epoch": 3.9769504075478457,
18661
+ "grad_norm": 4.0124053955078125,
18662
+ "learning_rate": 0.00016194435396773698,
18663
+ "loss": 1.1772,
18664
+ "step": 133200
18665
+ },
18666
+ {
18667
+ "epoch": 3.9784432567999284,
18668
+ "grad_norm": 4.683640480041504,
18669
+ "learning_rate": 0.00016193006817181811,
18670
+ "loss": 1.2748,
18671
+ "step": 133250
18672
+ },
18673
+ {
18674
+ "epoch": 3.979936106052011,
18675
+ "grad_norm": 5.253026008605957,
18676
+ "learning_rate": 0.0001619157823758993,
18677
+ "loss": 1.2193,
18678
+ "step": 133300
18679
+ },
18680
+ {
18681
+ "epoch": 3.9814289553040934,
18682
+ "grad_norm": 4.60040283203125,
18683
+ "learning_rate": 0.00016190149657998047,
18684
+ "loss": 1.2518,
18685
+ "step": 133350
18686
+ },
18687
+ {
18688
+ "epoch": 3.982921804556176,
18689
+ "grad_norm": 5.973727226257324,
18690
+ "learning_rate": 0.00016188721078406164,
18691
+ "loss": 1.242,
18692
+ "step": 133400
18693
+ },
18694
+ {
18695
+ "epoch": 3.9844146538082583,
18696
+ "grad_norm": 5.09537410736084,
18697
+ "learning_rate": 0.0001618729249881428,
18698
+ "loss": 1.2484,
18699
+ "step": 133450
18700
+ },
18701
+ {
18702
+ "epoch": 3.985907503060341,
18703
+ "grad_norm": 4.428155899047852,
18704
+ "learning_rate": 0.00016185863919222397,
18705
+ "loss": 1.2112,
18706
+ "step": 133500
18707
+ },
18708
+ {
18709
+ "epoch": 3.9874003523124237,
18710
+ "grad_norm": 4.2854838371276855,
18711
+ "learning_rate": 0.00016184435339630513,
18712
+ "loss": 1.2402,
18713
+ "step": 133550
18714
+ },
18715
+ {
18716
+ "epoch": 3.988893201564506,
18717
+ "grad_norm": 4.689759254455566,
18718
+ "learning_rate": 0.0001618300676003863,
18719
+ "loss": 1.2705,
18720
+ "step": 133600
18721
+ },
18722
+ {
18723
+ "epoch": 3.9903860508165887,
18724
+ "grad_norm": 4.272946357727051,
18725
+ "learning_rate": 0.00016181578180446746,
18726
+ "loss": 1.2105,
18727
+ "step": 133650
18728
+ },
18729
+ {
18730
+ "epoch": 3.991878900068671,
18731
+ "grad_norm": 4.82036018371582,
18732
+ "learning_rate": 0.00016180149600854862,
18733
+ "loss": 1.2379,
18734
+ "step": 133700
18735
+ },
18736
+ {
18737
+ "epoch": 3.9933717493207537,
18738
+ "grad_norm": 5.448759078979492,
18739
+ "learning_rate": 0.0001617872102126298,
18740
+ "loss": 1.2077,
18741
+ "step": 133750
18742
+ },
18743
+ {
18744
+ "epoch": 3.9948645985728364,
18745
+ "grad_norm": 3.519653797149658,
18746
+ "learning_rate": 0.00016177292441671098,
18747
+ "loss": 1.2057,
18748
+ "step": 133800
18749
+ },
18750
+ {
18751
+ "epoch": 3.9963574478249186,
18752
+ "grad_norm": 4.229268550872803,
18753
+ "learning_rate": 0.00016175863862079212,
18754
+ "loss": 1.3109,
18755
+ "step": 133850
18756
+ },
18757
+ {
18758
+ "epoch": 3.997850297077001,
18759
+ "grad_norm": 3.772096633911133,
18760
+ "learning_rate": 0.0001617443528248733,
18761
+ "loss": 1.3218,
18762
+ "step": 133900
18763
+ },
18764
+ {
18765
+ "epoch": 3.9993431463290836,
18766
+ "grad_norm": 3.7670326232910156,
18767
+ "learning_rate": 0.00016173006702895445,
18768
+ "loss": 1.2882,
18769
+ "step": 133950
18770
+ },
18771
+ {
18772
+ "epoch": 4.000835995581166,
18773
+ "grad_norm": 4.392240524291992,
18774
+ "learning_rate": 0.00016171578123303564,
18775
+ "loss": 1.2225,
18776
+ "step": 134000
18777
+ },
18778
+ {
18779
+ "epoch": 4.002328844833249,
18780
+ "grad_norm": 4.192978382110596,
18781
+ "learning_rate": 0.00016170149543711678,
18782
+ "loss": 1.1469,
18783
+ "step": 134050
18784
+ },
18785
+ {
18786
+ "epoch": 4.003821694085331,
18787
+ "grad_norm": 4.702342987060547,
18788
+ "learning_rate": 0.00016168720964119797,
18789
+ "loss": 1.1452,
18790
+ "step": 134100
18791
+ },
18792
+ {
18793
+ "epoch": 4.0053145433374135,
18794
+ "grad_norm": 4.7361555099487305,
18795
+ "learning_rate": 0.00016167292384527913,
18796
+ "loss": 1.1871,
18797
+ "step": 134150
18798
+ },
18799
+ {
18800
+ "epoch": 4.006807392589496,
18801
+ "grad_norm": 3.4626450538635254,
18802
+ "learning_rate": 0.0001616586380493603,
18803
+ "loss": 1.159,
18804
+ "step": 134200
18805
+ },
18806
+ {
18807
+ "epoch": 4.008300241841579,
18808
+ "grad_norm": 4.5900654792785645,
18809
+ "learning_rate": 0.00016164435225344146,
18810
+ "loss": 1.2012,
18811
+ "step": 134250
18812
+ },
18813
+ {
18814
+ "epoch": 4.009793091093662,
18815
+ "grad_norm": 6.448458194732666,
18816
+ "learning_rate": 0.00016163006645752263,
18817
+ "loss": 1.1656,
18818
+ "step": 134300
18819
+ },
18820
+ {
18821
+ "epoch": 4.0112859403457435,
18822
+ "grad_norm": 4.073904991149902,
18823
+ "learning_rate": 0.0001616157806616038,
18824
+ "loss": 1.131,
18825
+ "step": 134350
18826
+ },
18827
+ {
18828
+ "epoch": 4.012778789597826,
18829
+ "grad_norm": 5.392414569854736,
18830
+ "learning_rate": 0.00016160149486568496,
18831
+ "loss": 1.1462,
18832
+ "step": 134400
18833
+ },
18834
+ {
18835
+ "epoch": 4.014271638849909,
18836
+ "grad_norm": 4.959211349487305,
18837
+ "learning_rate": 0.00016158720906976612,
18838
+ "loss": 1.1275,
18839
+ "step": 134450
18840
+ },
18841
+ {
18842
+ "epoch": 4.015764488101992,
18843
+ "grad_norm": 4.862436771392822,
18844
+ "learning_rate": 0.00016157292327384729,
18845
+ "loss": 1.1395,
18846
+ "step": 134500
18847
+ },
18848
+ {
18849
+ "epoch": 4.017257337354074,
18850
+ "grad_norm": 4.025382995605469,
18851
+ "learning_rate": 0.00016155863747792845,
18852
+ "loss": 1.1529,
18853
+ "step": 134550
18854
+ },
18855
+ {
18856
+ "epoch": 4.018750186606156,
18857
+ "grad_norm": 5.5243449211120605,
18858
+ "learning_rate": 0.00016154435168200961,
18859
+ "loss": 1.1552,
18860
+ "step": 134600
18861
+ },
18862
+ {
18863
+ "epoch": 4.020243035858239,
18864
+ "grad_norm": 5.48097038269043,
18865
+ "learning_rate": 0.00016153006588609078,
18866
+ "loss": 1.1542,
18867
+ "step": 134650
18868
+ },
18869
+ {
18870
+ "epoch": 4.0217358851103215,
18871
+ "grad_norm": 5.017364978790283,
18872
+ "learning_rate": 0.00016151578009017194,
18873
+ "loss": 1.1294,
18874
+ "step": 134700
18875
+ },
18876
+ {
18877
+ "epoch": 4.023228734362404,
18878
+ "grad_norm": 3.903317451477051,
18879
+ "learning_rate": 0.0001615014942942531,
18880
+ "loss": 1.1743,
18881
+ "step": 134750
18882
+ },
18883
+ {
18884
+ "epoch": 4.024721583614487,
18885
+ "grad_norm": 5.144746780395508,
18886
+ "learning_rate": 0.00016148720849833427,
18887
+ "loss": 1.184,
18888
+ "step": 134800
18889
+ },
18890
+ {
18891
+ "epoch": 4.026214432866569,
18892
+ "grad_norm": 3.201718330383301,
18893
+ "learning_rate": 0.00016147292270241547,
18894
+ "loss": 1.1294,
18895
+ "step": 134850
18896
+ },
18897
+ {
18898
+ "epoch": 4.0277072821186515,
18899
+ "grad_norm": 5.051429748535156,
18900
+ "learning_rate": 0.0001614586369064966,
18901
+ "loss": 1.1625,
18902
+ "step": 134900
18903
+ },
18904
+ {
18905
+ "epoch": 4.029200131370734,
18906
+ "grad_norm": 4.083528995513916,
18907
+ "learning_rate": 0.0001614443511105778,
18908
+ "loss": 1.1865,
18909
+ "step": 134950
18910
+ },
18911
+ {
18912
+ "epoch": 4.030692980622817,
18913
+ "grad_norm": 4.694931983947754,
18914
+ "learning_rate": 0.00016143006531465893,
18915
+ "loss": 1.1607,
18916
+ "step": 135000
18917
+ },
18918
+ {
18919
+ "epoch": 4.0321858298749,
18920
+ "grad_norm": 3.7577016353607178,
18921
+ "learning_rate": 0.00016141577951874012,
18922
+ "loss": 1.2009,
18923
+ "step": 135050
18924
+ },
18925
+ {
18926
+ "epoch": 4.033678679126981,
18927
+ "grad_norm": 3.913961887359619,
18928
+ "learning_rate": 0.00016140149372282126,
18929
+ "loss": 1.148,
18930
+ "step": 135100
18931
+ },
18932
+ {
18933
+ "epoch": 4.035171528379064,
18934
+ "grad_norm": 4.2388129234313965,
18935
+ "learning_rate": 0.00016138720792690245,
18936
+ "loss": 1.1673,
18937
+ "step": 135150
18938
+ },
18939
+ {
18940
+ "epoch": 4.036664377631147,
18941
+ "grad_norm": 4.703933238983154,
18942
+ "learning_rate": 0.00016137292213098362,
18943
+ "loss": 1.1249,
18944
+ "step": 135200
18945
+ },
18946
+ {
18947
+ "epoch": 4.0381572268832295,
18948
+ "grad_norm": 5.607854843139648,
18949
+ "learning_rate": 0.00016135863633506478,
18950
+ "loss": 1.1609,
18951
+ "step": 135250
18952
+ },
18953
+ {
18954
+ "epoch": 4.039650076135312,
18955
+ "grad_norm": 4.3828840255737305,
18956
+ "learning_rate": 0.00016134435053914595,
18957
+ "loss": 1.1788,
18958
+ "step": 135300
18959
+ },
18960
+ {
18961
+ "epoch": 4.041142925387394,
18962
+ "grad_norm": 5.778615474700928,
18963
+ "learning_rate": 0.0001613300647432271,
18964
+ "loss": 1.2101,
18965
+ "step": 135350
18966
+ },
18967
+ {
18968
+ "epoch": 4.042635774639477,
18969
+ "grad_norm": 4.011424541473389,
18970
+ "learning_rate": 0.00016131577894730828,
18971
+ "loss": 1.1991,
18972
+ "step": 135400
18973
+ },
18974
+ {
18975
+ "epoch": 4.0441286238915595,
18976
+ "grad_norm": 5.844552993774414,
18977
+ "learning_rate": 0.00016130149315138944,
18978
+ "loss": 1.2172,
18979
+ "step": 135450
18980
+ },
18981
+ {
18982
+ "epoch": 4.045621473143642,
18983
+ "grad_norm": 4.851437091827393,
18984
+ "learning_rate": 0.0001612872073554706,
18985
+ "loss": 1.2379,
18986
+ "step": 135500
18987
+ },
18988
+ {
18989
+ "epoch": 4.047114322395725,
18990
+ "grad_norm": 6.846874713897705,
18991
+ "learning_rate": 0.00016127292155955177,
18992
+ "loss": 1.1666,
18993
+ "step": 135550
18994
+ },
18995
+ {
18996
+ "epoch": 4.048607171647807,
18997
+ "grad_norm": 3.9584429264068604,
18998
+ "learning_rate": 0.00016125863576363294,
18999
+ "loss": 1.191,
19000
+ "step": 135600
19001
+ },
19002
+ {
19003
+ "epoch": 4.050100020899889,
19004
+ "grad_norm": 3.547799587249756,
19005
+ "learning_rate": 0.00016124434996771413,
19006
+ "loss": 1.234,
19007
+ "step": 135650
19008
+ },
19009
+ {
19010
+ "epoch": 4.051592870151972,
19011
+ "grad_norm": 4.4900641441345215,
19012
+ "learning_rate": 0.00016123006417179526,
19013
+ "loss": 1.1759,
19014
+ "step": 135700
19015
+ },
19016
+ {
19017
+ "epoch": 4.053085719404055,
19018
+ "grad_norm": 4.560009479522705,
19019
+ "learning_rate": 0.00016121577837587646,
19020
+ "loss": 1.1759,
19021
+ "step": 135750
19022
+ },
19023
+ {
19024
+ "epoch": 4.0545785686561375,
19025
+ "grad_norm": 4.02714204788208,
19026
+ "learning_rate": 0.0001612014925799576,
19027
+ "loss": 1.1788,
19028
+ "step": 135800
19029
+ },
19030
+ {
19031
+ "epoch": 4.056071417908219,
19032
+ "grad_norm": 4.854091644287109,
19033
+ "learning_rate": 0.00016118720678403879,
19034
+ "loss": 1.1621,
19035
+ "step": 135850
19036
+ },
19037
+ {
19038
+ "epoch": 4.057564267160302,
19039
+ "grad_norm": 4.28786563873291,
19040
+ "learning_rate": 0.00016117292098811992,
19041
+ "loss": 1.1694,
19042
+ "step": 135900
19043
+ },
19044
+ {
19045
+ "epoch": 4.059057116412385,
19046
+ "grad_norm": 5.054505825042725,
19047
+ "learning_rate": 0.00016115863519220111,
19048
+ "loss": 1.2124,
19049
+ "step": 135950
19050
+ },
19051
+ {
19052
+ "epoch": 4.0605499656644675,
19053
+ "grad_norm": 4.706223011016846,
19054
+ "learning_rate": 0.00016114434939628228,
19055
+ "loss": 1.1867,
19056
+ "step": 136000
19057
+ },
19058
+ {
19059
+ "epoch": 4.06204281491655,
19060
+ "grad_norm": 3.443118095397949,
19061
+ "learning_rate": 0.00016113006360036344,
19062
+ "loss": 1.1398,
19063
+ "step": 136050
19064
+ },
19065
+ {
19066
+ "epoch": 4.063535664168632,
19067
+ "grad_norm": 5.108642578125,
19068
+ "learning_rate": 0.0001611157778044446,
19069
+ "loss": 1.1902,
19070
+ "step": 136100
19071
+ },
19072
+ {
19073
+ "epoch": 4.065028513420715,
19074
+ "grad_norm": 5.258908271789551,
19075
+ "learning_rate": 0.00016110149200852577,
19076
+ "loss": 1.2093,
19077
+ "step": 136150
19078
+ },
19079
+ {
19080
+ "epoch": 4.066521362672797,
19081
+ "grad_norm": 3.8509016036987305,
19082
+ "learning_rate": 0.00016108720621260694,
19083
+ "loss": 1.1354,
19084
+ "step": 136200
19085
+ },
19086
+ {
19087
+ "epoch": 4.06801421192488,
19088
+ "grad_norm": 5.995665073394775,
19089
+ "learning_rate": 0.0001610729204166881,
19090
+ "loss": 1.1516,
19091
+ "step": 136250
19092
+ },
19093
+ {
19094
+ "epoch": 4.069507061176962,
19095
+ "grad_norm": 4.4564738273620605,
19096
+ "learning_rate": 0.00016105863462076927,
19097
+ "loss": 1.1405,
19098
+ "step": 136300
19099
+ },
19100
+ {
19101
+ "epoch": 4.070999910429045,
19102
+ "grad_norm": 7.378493309020996,
19103
+ "learning_rate": 0.00016104434882485043,
19104
+ "loss": 1.1817,
19105
+ "step": 136350
19106
+ },
19107
+ {
19108
+ "epoch": 4.072492759681127,
19109
+ "grad_norm": 4.1566009521484375,
19110
+ "learning_rate": 0.0001610300630289316,
19111
+ "loss": 1.1988,
19112
+ "step": 136400
19113
+ },
19114
+ {
19115
+ "epoch": 4.07398560893321,
19116
+ "grad_norm": 4.735561370849609,
19117
+ "learning_rate": 0.0001610157772330128,
19118
+ "loss": 1.1847,
19119
+ "step": 136450
19120
+ },
19121
+ {
19122
+ "epoch": 4.075478458185293,
19123
+ "grad_norm": 6.040157318115234,
19124
+ "learning_rate": 0.00016100149143709393,
19125
+ "loss": 1.1762,
19126
+ "step": 136500
19127
+ },
19128
+ {
19129
+ "epoch": 4.076971307437375,
19130
+ "grad_norm": 4.695454120635986,
19131
+ "learning_rate": 0.00016098720564117512,
19132
+ "loss": 1.2151,
19133
+ "step": 136550
19134
+ },
19135
+ {
19136
+ "epoch": 4.078464156689457,
19137
+ "grad_norm": 5.111282825469971,
19138
+ "learning_rate": 0.00016097291984525626,
19139
+ "loss": 1.2581,
19140
+ "step": 136600
19141
+ },
19142
+ {
19143
+ "epoch": 4.07995700594154,
19144
+ "grad_norm": 6.601406574249268,
19145
+ "learning_rate": 0.00016095863404933745,
19146
+ "loss": 1.1902,
19147
+ "step": 136650
19148
+ },
19149
+ {
19150
+ "epoch": 4.081449855193623,
19151
+ "grad_norm": 4.173478126525879,
19152
+ "learning_rate": 0.00016094434825341858,
19153
+ "loss": 1.1809,
19154
+ "step": 136700
19155
+ },
19156
+ {
19157
+ "epoch": 4.082942704445705,
19158
+ "grad_norm": 4.189050674438477,
19159
+ "learning_rate": 0.00016093006245749978,
19160
+ "loss": 1.1837,
19161
+ "step": 136750
19162
+ },
19163
+ {
19164
+ "epoch": 4.084435553697787,
19165
+ "grad_norm": 4.181232929229736,
19166
+ "learning_rate": 0.00016091577666158094,
19167
+ "loss": 1.124,
19168
+ "step": 136800
19169
+ },
19170
+ {
19171
+ "epoch": 4.08592840294987,
19172
+ "grad_norm": 6.015085697174072,
19173
+ "learning_rate": 0.0001609014908656621,
19174
+ "loss": 1.2253,
19175
+ "step": 136850
19176
+ },
19177
+ {
19178
+ "epoch": 4.087421252201953,
19179
+ "grad_norm": 4.773750305175781,
19180
+ "learning_rate": 0.00016088720506974327,
19181
+ "loss": 1.1498,
19182
+ "step": 136900
19183
+ },
19184
+ {
19185
+ "epoch": 4.088914101454035,
19186
+ "grad_norm": 4.635052680969238,
19187
+ "learning_rate": 0.00016087291927382443,
19188
+ "loss": 1.1946,
19189
+ "step": 136950
19190
+ },
19191
+ {
19192
+ "epoch": 4.090406950706118,
19193
+ "grad_norm": 3.5755093097686768,
19194
+ "learning_rate": 0.0001608586334779056,
19195
+ "loss": 1.178,
19196
+ "step": 137000
19197
+ },
19198
+ {
19199
+ "epoch": 4.0918997999582,
19200
+ "grad_norm": 5.496055603027344,
19201
+ "learning_rate": 0.00016084434768198676,
19202
+ "loss": 1.1409,
19203
+ "step": 137050
19204
+ },
19205
+ {
19206
+ "epoch": 4.093392649210283,
19207
+ "grad_norm": 4.117304801940918,
19208
+ "learning_rate": 0.00016083006188606793,
19209
+ "loss": 1.2,
19210
+ "step": 137100
19211
+ },
19212
+ {
19213
+ "epoch": 4.094885498462365,
19214
+ "grad_norm": 5.538839817047119,
19215
+ "learning_rate": 0.0001608157760901491,
19216
+ "loss": 1.1778,
19217
+ "step": 137150
19218
+ },
19219
+ {
19220
+ "epoch": 4.096378347714448,
19221
+ "grad_norm": 4.6166558265686035,
19222
+ "learning_rate": 0.00016080149029423026,
19223
+ "loss": 1.1797,
19224
+ "step": 137200
19225
+ },
19226
+ {
19227
+ "epoch": 4.097871196966531,
19228
+ "grad_norm": 4.904277324676514,
19229
+ "learning_rate": 0.00016078720449831142,
19230
+ "loss": 1.1951,
19231
+ "step": 137250
19232
+ },
19233
+ {
19234
+ "epoch": 4.0993640462186125,
19235
+ "grad_norm": 4.139499664306641,
19236
+ "learning_rate": 0.0001607729187023926,
19237
+ "loss": 1.1909,
19238
+ "step": 137300
19239
+ },
19240
+ {
19241
+ "epoch": 4.100856895470695,
19242
+ "grad_norm": 4.454082489013672,
19243
+ "learning_rate": 0.00016075863290647375,
19244
+ "loss": 1.1944,
19245
+ "step": 137350
19246
+ },
19247
+ {
19248
+ "epoch": 4.102349744722778,
19249
+ "grad_norm": 6.1854963302612305,
19250
+ "learning_rate": 0.00016074434711055492,
19251
+ "loss": 1.1934,
19252
+ "step": 137400
19253
+ },
19254
+ {
19255
+ "epoch": 4.103842593974861,
19256
+ "grad_norm": 3.4759559631347656,
19257
+ "learning_rate": 0.00016073006131463608,
19258
+ "loss": 1.1489,
19259
+ "step": 137450
19260
+ },
19261
+ {
19262
+ "epoch": 4.105335443226943,
19263
+ "grad_norm": 5.522613525390625,
19264
+ "learning_rate": 0.00016071577551871727,
19265
+ "loss": 1.1765,
19266
+ "step": 137500
19267
+ },
19268
+ {
19269
+ "epoch": 4.106828292479025,
19270
+ "grad_norm": 4.545987606048584,
19271
+ "learning_rate": 0.0001607014897227984,
19272
+ "loss": 1.2346,
19273
+ "step": 137550
19274
+ },
19275
+ {
19276
+ "epoch": 4.108321141731108,
19277
+ "grad_norm": 3.9791605472564697,
19278
+ "learning_rate": 0.0001606872039268796,
19279
+ "loss": 1.1124,
19280
+ "step": 137600
19281
+ },
19282
+ {
19283
+ "epoch": 4.109813990983191,
19284
+ "grad_norm": 5.259634971618652,
19285
+ "learning_rate": 0.00016067291813096074,
19286
+ "loss": 1.2323,
19287
+ "step": 137650
19288
+ },
19289
+ {
19290
+ "epoch": 4.111306840235273,
19291
+ "grad_norm": 3.549207925796509,
19292
+ "learning_rate": 0.00016065863233504193,
19293
+ "loss": 1.2269,
19294
+ "step": 137700
19295
+ },
19296
+ {
19297
+ "epoch": 4.112799689487356,
19298
+ "grad_norm": 4.890096664428711,
19299
+ "learning_rate": 0.00016064434653912307,
19300
+ "loss": 1.142,
19301
+ "step": 137750
19302
+ },
19303
+ {
19304
+ "epoch": 4.114292538739438,
19305
+ "grad_norm": 3.7870566844940186,
19306
+ "learning_rate": 0.00016063006074320426,
19307
+ "loss": 1.1446,
19308
+ "step": 137800
19309
+ },
19310
+ {
19311
+ "epoch": 4.1157853879915205,
19312
+ "grad_norm": 5.0174689292907715,
19313
+ "learning_rate": 0.00016061577494728543,
19314
+ "loss": 1.2022,
19315
+ "step": 137850
19316
+ },
19317
+ {
19318
+ "epoch": 4.117278237243603,
19319
+ "grad_norm": 4.542319297790527,
19320
+ "learning_rate": 0.0001606014891513666,
19321
+ "loss": 1.1809,
19322
+ "step": 137900
19323
+ },
19324
+ {
19325
+ "epoch": 4.118771086495686,
19326
+ "grad_norm": 5.590012073516846,
19327
+ "learning_rate": 0.00016058720335544776,
19328
+ "loss": 1.1811,
19329
+ "step": 137950
19330
+ },
19331
+ {
19332
+ "epoch": 4.120263935747769,
19333
+ "grad_norm": 4.569715976715088,
19334
+ "learning_rate": 0.00016057291755952892,
19335
+ "loss": 1.1715,
19336
+ "step": 138000
19337
+ },
19338
+ {
19339
+ "epoch": 4.12175678499985,
19340
+ "grad_norm": 3.725158214569092,
19341
+ "learning_rate": 0.00016055863176361008,
19342
+ "loss": 1.1825,
19343
+ "step": 138050
19344
+ },
19345
+ {
19346
+ "epoch": 4.123249634251933,
19347
+ "grad_norm": 3.941608428955078,
19348
+ "learning_rate": 0.00016054434596769125,
19349
+ "loss": 1.1482,
19350
+ "step": 138100
19351
+ },
19352
+ {
19353
+ "epoch": 4.124742483504016,
19354
+ "grad_norm": 4.545751094818115,
19355
+ "learning_rate": 0.00016053006017177241,
19356
+ "loss": 1.2089,
19357
+ "step": 138150
19358
+ },
19359
+ {
19360
+ "epoch": 4.1262353327560986,
19361
+ "grad_norm": 6.809751033782959,
19362
+ "learning_rate": 0.00016051577437585358,
19363
+ "loss": 1.1664,
19364
+ "step": 138200
19365
+ },
19366
+ {
19367
+ "epoch": 4.12772818200818,
19368
+ "grad_norm": 5.444116592407227,
19369
+ "learning_rate": 0.00016050148857993474,
19370
+ "loss": 1.2127,
19371
+ "step": 138250
19372
+ },
19373
+ {
19374
+ "epoch": 4.129221031260263,
19375
+ "grad_norm": 4.276831150054932,
19376
+ "learning_rate": 0.00016048720278401593,
19377
+ "loss": 1.2138,
19378
+ "step": 138300
19379
+ },
19380
+ {
19381
+ "epoch": 4.130713880512346,
19382
+ "grad_norm": 3.9716556072235107,
19383
+ "learning_rate": 0.00016047291698809707,
19384
+ "loss": 1.2237,
19385
+ "step": 138350
19386
+ },
19387
+ {
19388
+ "epoch": 4.1322067297644285,
19389
+ "grad_norm": 3.9793570041656494,
19390
+ "learning_rate": 0.00016045863119217826,
19391
+ "loss": 1.1997,
19392
+ "step": 138400
19393
+ },
19394
+ {
19395
+ "epoch": 4.133699579016511,
19396
+ "grad_norm": 4.590352535247803,
19397
+ "learning_rate": 0.0001604443453962594,
19398
+ "loss": 1.1631,
19399
+ "step": 138450
19400
+ },
19401
+ {
19402
+ "epoch": 4.135192428268593,
19403
+ "grad_norm": 3.9549121856689453,
19404
+ "learning_rate": 0.0001604300596003406,
19405
+ "loss": 1.1518,
19406
+ "step": 138500
19407
+ },
19408
+ {
19409
+ "epoch": 4.136685277520676,
19410
+ "grad_norm": 5.076075077056885,
19411
+ "learning_rate": 0.00016041577380442173,
19412
+ "loss": 1.1806,
19413
+ "step": 138550
19414
+ },
19415
+ {
19416
+ "epoch": 4.138178126772758,
19417
+ "grad_norm": 6.228057384490967,
19418
+ "learning_rate": 0.00016040148800850292,
19419
+ "loss": 1.1547,
19420
+ "step": 138600
19421
+ },
19422
+ {
19423
+ "epoch": 4.139670976024841,
19424
+ "grad_norm": 4.276813507080078,
19425
+ "learning_rate": 0.0001603872022125841,
19426
+ "loss": 1.1351,
19427
+ "step": 138650
19428
+ },
19429
+ {
19430
+ "epoch": 4.141163825276924,
19431
+ "grad_norm": 5.215205669403076,
19432
+ "learning_rate": 0.00016037291641666525,
19433
+ "loss": 1.2021,
19434
+ "step": 138700
19435
+ },
19436
+ {
19437
+ "epoch": 4.142656674529006,
19438
+ "grad_norm": 4.386962413787842,
19439
+ "learning_rate": 0.00016035863062074642,
19440
+ "loss": 1.2346,
19441
+ "step": 138750
19442
+ },
19443
+ {
19444
+ "epoch": 4.144149523781088,
19445
+ "grad_norm": 4.739329814910889,
19446
+ "learning_rate": 0.00016034434482482758,
19447
+ "loss": 1.1971,
19448
+ "step": 138800
19449
+ },
19450
+ {
19451
+ "epoch": 4.145642373033171,
19452
+ "grad_norm": 3.994267225265503,
19453
+ "learning_rate": 0.00016033005902890875,
19454
+ "loss": 1.2259,
19455
+ "step": 138850
19456
+ },
19457
+ {
19458
+ "epoch": 4.147135222285254,
19459
+ "grad_norm": 4.481820106506348,
19460
+ "learning_rate": 0.0001603157732329899,
19461
+ "loss": 1.2428,
19462
+ "step": 138900
19463
+ },
19464
+ {
19465
+ "epoch": 4.1486280715373365,
19466
+ "grad_norm": 5.454226016998291,
19467
+ "learning_rate": 0.00016030148743707108,
19468
+ "loss": 1.1596,
19469
+ "step": 138950
19470
+ },
19471
+ {
19472
+ "epoch": 4.150120920789418,
19473
+ "grad_norm": 5.072449684143066,
19474
+ "learning_rate": 0.00016028720164115224,
19475
+ "loss": 1.1755,
19476
+ "step": 139000
19477
+ },
19478
+ {
19479
+ "epoch": 4.151613770041501,
19480
+ "grad_norm": 4.940500259399414,
19481
+ "learning_rate": 0.0001602729158452334,
19482
+ "loss": 1.1765,
19483
+ "step": 139050
19484
+ },
19485
+ {
19486
+ "epoch": 4.153106619293584,
19487
+ "grad_norm": 4.294040203094482,
19488
+ "learning_rate": 0.0001602586300493146,
19489
+ "loss": 1.1638,
19490
+ "step": 139100
19491
+ },
19492
+ {
19493
+ "epoch": 4.154599468545666,
19494
+ "grad_norm": 3.98103928565979,
19495
+ "learning_rate": 0.00016024434425339573,
19496
+ "loss": 1.192,
19497
+ "step": 139150
19498
+ },
19499
+ {
19500
+ "epoch": 4.156092317797749,
19501
+ "grad_norm": 5.18140983581543,
19502
+ "learning_rate": 0.00016023005845747693,
19503
+ "loss": 1.2121,
19504
+ "step": 139200
19505
+ },
19506
+ {
19507
+ "epoch": 4.157585167049831,
19508
+ "grad_norm": 4.342568874359131,
19509
+ "learning_rate": 0.00016021577266155806,
19510
+ "loss": 1.2056,
19511
+ "step": 139250
19512
+ },
19513
+ {
19514
+ "epoch": 4.159078016301914,
19515
+ "grad_norm": 4.284645080566406,
19516
+ "learning_rate": 0.00016020148686563926,
19517
+ "loss": 1.1864,
19518
+ "step": 139300
19519
+ },
19520
+ {
19521
+ "epoch": 4.160570865553996,
19522
+ "grad_norm": 5.542572498321533,
19523
+ "learning_rate": 0.0001601872010697204,
19524
+ "loss": 1.1903,
19525
+ "step": 139350
19526
+ },
19527
+ {
19528
+ "epoch": 4.162063714806079,
19529
+ "grad_norm": 4.752786159515381,
19530
+ "learning_rate": 0.00016017291527380158,
19531
+ "loss": 1.2012,
19532
+ "step": 139400
19533
+ },
19534
+ {
19535
+ "epoch": 4.163556564058162,
19536
+ "grad_norm": 4.298015117645264,
19537
+ "learning_rate": 0.00016015862947788275,
19538
+ "loss": 1.2108,
19539
+ "step": 139450
19540
+ },
19541
+ {
19542
+ "epoch": 4.165049413310244,
19543
+ "grad_norm": 4.048568248748779,
19544
+ "learning_rate": 0.00016014434368196391,
19545
+ "loss": 1.2191,
19546
+ "step": 139500
19547
+ },
19548
+ {
19549
+ "epoch": 4.166542262562326,
19550
+ "grad_norm": 4.178681373596191,
19551
+ "learning_rate": 0.00016013005788604508,
19552
+ "loss": 1.1892,
19553
+ "step": 139550
19554
+ },
19555
+ {
19556
+ "epoch": 4.168035111814409,
19557
+ "grad_norm": 6.201132297515869,
19558
+ "learning_rate": 0.00016011577209012622,
19559
+ "loss": 1.1917,
19560
+ "step": 139600
19561
+ },
19562
+ {
19563
+ "epoch": 4.169527961066492,
19564
+ "grad_norm": 4.938151836395264,
19565
+ "learning_rate": 0.0001601014862942074,
19566
+ "loss": 1.1954,
19567
+ "step": 139650
19568
+ },
19569
+ {
19570
+ "epoch": 4.171020810318574,
19571
+ "grad_norm": 6.060943126678467,
19572
+ "learning_rate": 0.00016008720049828855,
19573
+ "loss": 1.1471,
19574
+ "step": 139700
19575
+ },
19576
+ {
19577
+ "epoch": 4.172513659570656,
19578
+ "grad_norm": 5.0269880294799805,
19579
+ "learning_rate": 0.00016007291470236974,
19580
+ "loss": 1.1868,
19581
+ "step": 139750
19582
+ },
19583
+ {
19584
+ "epoch": 4.174006508822739,
19585
+ "grad_norm": 4.509561538696289,
19586
+ "learning_rate": 0.0001600586289064509,
19587
+ "loss": 1.1906,
19588
+ "step": 139800
19589
+ },
19590
+ {
19591
+ "epoch": 4.175499358074822,
19592
+ "grad_norm": 4.10978364944458,
19593
+ "learning_rate": 0.00016004434311053207,
19594
+ "loss": 1.1515,
19595
+ "step": 139850
19596
+ },
19597
+ {
19598
+ "epoch": 4.176992207326904,
19599
+ "grad_norm": 3.7452332973480225,
19600
+ "learning_rate": 0.00016003005731461323,
19601
+ "loss": 1.1807,
19602
+ "step": 139900
19603
+ },
19604
+ {
19605
+ "epoch": 4.178485056578987,
19606
+ "grad_norm": 4.911968231201172,
19607
+ "learning_rate": 0.0001600157715186944,
19608
+ "loss": 1.1568,
19609
+ "step": 139950
19610
+ },
19611
+ {
19612
+ "epoch": 4.179977905831069,
19613
+ "grad_norm": 4.606587886810303,
19614
+ "learning_rate": 0.00016000148572277556,
19615
+ "loss": 1.2099,
19616
+ "step": 140000
19617
  }
19618
  ],
19619
  "logging_steps": 50,
 
19633
  "attributes": {}
19634
  }
19635
  },
19636
+ "total_flos": 3.536269939317932e+18,
19637
  "train_batch_size": 2,
19638
  "trial_name": null,
19639
  "trial_params": null