Training in progress, step 19500, checkpoint
Browse files
.gitattributes
CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
checkpoint-19500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
checkpoint-19500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3541119728
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dec21d65cc35c5b2575c9f28438a7f3e71903280791cff6024d6130cc74f123b
|
3 |
size 3541119728
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 778374186
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2251652726c40e6dfcb11baba340311083aebf1f86c81a23f7dc9d79eac124bb
|
3 |
size 778374186
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff6a16ef822fb8170c7c05fd4d1180b525bf072c302b69d261a30b0549778c78
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 0.
|
6 |
"eval_steps": 500,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -17288,6 +17288,276 @@
|
|
17288 |
"mean_token_accuracy": 0.860031221807003,
|
17289 |
"num_tokens": 31894612.0,
|
17290 |
"step": 19200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17291 |
}
|
17292 |
],
|
17293 |
"logging_steps": 10,
|
@@ -17307,7 +17577,7 @@
|
|
17307 |
"attributes": {}
|
17308 |
}
|
17309 |
},
|
17310 |
-
"total_flos": 7.
|
17311 |
"train_batch_size": 2,
|
17312 |
"trial_name": null,
|
17313 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 0.9305765995776614,
|
6 |
"eval_steps": 500,
|
7 |
+
"global_step": 19500,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
17288 |
"mean_token_accuracy": 0.860031221807003,
|
17289 |
"num_tokens": 31894612.0,
|
17290 |
"step": 19200
|
17291 |
+
},
|
17292 |
+
{
|
17293 |
+
"epoch": 0.91673725527625,
|
17294 |
+
"grad_norm": 0.37166985869407654,
|
17295 |
+
"learning_rate": 1.0833214030064425e-05,
|
17296 |
+
"loss": 0.5797,
|
17297 |
+
"mean_token_accuracy": 0.8837983384728432,
|
17298 |
+
"num_tokens": 31910332.0,
|
17299 |
+
"step": 19210
|
17300 |
+
},
|
17301 |
+
{
|
17302 |
+
"epoch": 0.9172144740452642,
|
17303 |
+
"grad_norm": 0.3820003271102905,
|
17304 |
+
"learning_rate": 1.0828441899308041e-05,
|
17305 |
+
"loss": 0.611,
|
17306 |
+
"mean_token_accuracy": 0.8845193833112717,
|
17307 |
+
"num_tokens": 31926866.0,
|
17308 |
+
"step": 19220
|
17309 |
+
},
|
17310 |
+
{
|
17311 |
+
"epoch": 0.9176916928142784,
|
17312 |
+
"grad_norm": 0.3208041489124298,
|
17313 |
+
"learning_rate": 1.082366976855166e-05,
|
17314 |
+
"loss": 0.6218,
|
17315 |
+
"mean_token_accuracy": 0.8753577992320061,
|
17316 |
+
"num_tokens": 31944162.0,
|
17317 |
+
"step": 19230
|
17318 |
+
},
|
17319 |
+
{
|
17320 |
+
"epoch": 0.9181689115832926,
|
17321 |
+
"grad_norm": 0.4074268639087677,
|
17322 |
+
"learning_rate": 1.0818897637795276e-05,
|
17323 |
+
"loss": 0.5728,
|
17324 |
+
"mean_token_accuracy": 0.8923567131161689,
|
17325 |
+
"num_tokens": 31959882.0,
|
17326 |
+
"step": 19240
|
17327 |
+
},
|
17328 |
+
{
|
17329 |
+
"epoch": 0.9186461303523068,
|
17330 |
+
"grad_norm": 0.4497404992580414,
|
17331 |
+
"learning_rate": 1.0814125507038893e-05,
|
17332 |
+
"loss": 0.5369,
|
17333 |
+
"mean_token_accuracy": 0.8921236410737038,
|
17334 |
+
"num_tokens": 31974914.0,
|
17335 |
+
"step": 19250
|
17336 |
+
},
|
17337 |
+
{
|
17338 |
+
"epoch": 0.919123349121321,
|
17339 |
+
"grad_norm": 0.38203802704811096,
|
17340 |
+
"learning_rate": 1.0809353376282511e-05,
|
17341 |
+
"loss": 0.6455,
|
17342 |
+
"mean_token_accuracy": 0.8690215855836868,
|
17343 |
+
"num_tokens": 31990678.0,
|
17344 |
+
"step": 19260
|
17345 |
+
},
|
17346 |
+
{
|
17347 |
+
"epoch": 0.9196005678903352,
|
17348 |
+
"grad_norm": 0.32773557305336,
|
17349 |
+
"learning_rate": 1.080458124552613e-05,
|
17350 |
+
"loss": 0.669,
|
17351 |
+
"mean_token_accuracy": 0.870218101143837,
|
17352 |
+
"num_tokens": 32007764.0,
|
17353 |
+
"step": 19270
|
17354 |
+
},
|
17355 |
+
{
|
17356 |
+
"epoch": 0.9200777866593494,
|
17357 |
+
"grad_norm": 0.4381488561630249,
|
17358 |
+
"learning_rate": 1.0799809114769746e-05,
|
17359 |
+
"loss": 0.6339,
|
17360 |
+
"mean_token_accuracy": 0.8728301003575325,
|
17361 |
+
"num_tokens": 32024411.0,
|
17362 |
+
"step": 19280
|
17363 |
+
},
|
17364 |
+
{
|
17365 |
+
"epoch": 0.9205550054283635,
|
17366 |
+
"grad_norm": 0.4450734257698059,
|
17367 |
+
"learning_rate": 1.0795036984013363e-05,
|
17368 |
+
"loss": 0.777,
|
17369 |
+
"mean_token_accuracy": 0.8429723799228668,
|
17370 |
+
"num_tokens": 32043443.0,
|
17371 |
+
"step": 19290
|
17372 |
+
},
|
17373 |
+
{
|
17374 |
+
"epoch": 0.9210322241973777,
|
17375 |
+
"grad_norm": 0.31893327832221985,
|
17376 |
+
"learning_rate": 1.079026485325698e-05,
|
17377 |
+
"loss": 0.7236,
|
17378 |
+
"mean_token_accuracy": 0.859081144630909,
|
17379 |
+
"num_tokens": 32062229.0,
|
17380 |
+
"step": 19300
|
17381 |
+
},
|
17382 |
+
{
|
17383 |
+
"epoch": 0.9215094429663919,
|
17384 |
+
"grad_norm": 0.3973105251789093,
|
17385 |
+
"learning_rate": 1.0785492722500596e-05,
|
17386 |
+
"loss": 0.5872,
|
17387 |
+
"mean_token_accuracy": 0.871922855079174,
|
17388 |
+
"num_tokens": 32079383.0,
|
17389 |
+
"step": 19310
|
17390 |
+
},
|
17391 |
+
{
|
17392 |
+
"epoch": 0.921986661735406,
|
17393 |
+
"grad_norm": 0.295210063457489,
|
17394 |
+
"learning_rate": 1.0780720591744213e-05,
|
17395 |
+
"loss": 0.5652,
|
17396 |
+
"mean_token_accuracy": 0.8859059333801269,
|
17397 |
+
"num_tokens": 32093751.0,
|
17398 |
+
"step": 19320
|
17399 |
+
},
|
17400 |
+
{
|
17401 |
+
"epoch": 0.9224638805044202,
|
17402 |
+
"grad_norm": 0.3628122806549072,
|
17403 |
+
"learning_rate": 1.0775948460987833e-05,
|
17404 |
+
"loss": 0.6767,
|
17405 |
+
"mean_token_accuracy": 0.8768541231751442,
|
17406 |
+
"num_tokens": 32109295.0,
|
17407 |
+
"step": 19330
|
17408 |
+
},
|
17409 |
+
{
|
17410 |
+
"epoch": 0.9229410992734344,
|
17411 |
+
"grad_norm": 0.3489735424518585,
|
17412 |
+
"learning_rate": 1.077117633023145e-05,
|
17413 |
+
"loss": 0.7395,
|
17414 |
+
"mean_token_accuracy": 0.8563176363706588,
|
17415 |
+
"num_tokens": 32127172.0,
|
17416 |
+
"step": 19340
|
17417 |
+
},
|
17418 |
+
{
|
17419 |
+
"epoch": 0.9234183180424486,
|
17420 |
+
"grad_norm": 0.5454393625259399,
|
17421 |
+
"learning_rate": 1.0766404199475067e-05,
|
17422 |
+
"loss": 0.7232,
|
17423 |
+
"mean_token_accuracy": 0.8605073913931847,
|
17424 |
+
"num_tokens": 32145650.0,
|
17425 |
+
"step": 19350
|
17426 |
+
},
|
17427 |
+
{
|
17428 |
+
"epoch": 0.9238955368114627,
|
17429 |
+
"grad_norm": 0.5151296854019165,
|
17430 |
+
"learning_rate": 1.0761632068718683e-05,
|
17431 |
+
"loss": 0.7183,
|
17432 |
+
"mean_token_accuracy": 0.8582200676202774,
|
17433 |
+
"num_tokens": 32163210.0,
|
17434 |
+
"step": 19360
|
17435 |
+
},
|
17436 |
+
{
|
17437 |
+
"epoch": 0.9243727555804769,
|
17438 |
+
"grad_norm": 0.4276362359523773,
|
17439 |
+
"learning_rate": 1.07568599379623e-05,
|
17440 |
+
"loss": 0.617,
|
17441 |
+
"mean_token_accuracy": 0.8801989361643792,
|
17442 |
+
"num_tokens": 32179260.0,
|
17443 |
+
"step": 19370
|
17444 |
+
},
|
17445 |
+
{
|
17446 |
+
"epoch": 0.9248499743494911,
|
17447 |
+
"grad_norm": 0.3694617450237274,
|
17448 |
+
"learning_rate": 1.0752087807205918e-05,
|
17449 |
+
"loss": 0.6555,
|
17450 |
+
"mean_token_accuracy": 0.8701679021120071,
|
17451 |
+
"num_tokens": 32196081.0,
|
17452 |
+
"step": 19380
|
17453 |
+
},
|
17454 |
+
{
|
17455 |
+
"epoch": 0.9253271931185053,
|
17456 |
+
"grad_norm": 0.34691864252090454,
|
17457 |
+
"learning_rate": 1.0747315676449535e-05,
|
17458 |
+
"loss": 0.6075,
|
17459 |
+
"mean_token_accuracy": 0.886542621254921,
|
17460 |
+
"num_tokens": 32212271.0,
|
17461 |
+
"step": 19390
|
17462 |
+
},
|
17463 |
+
{
|
17464 |
+
"epoch": 0.9258044118875195,
|
17465 |
+
"grad_norm": 0.31945309042930603,
|
17466 |
+
"learning_rate": 1.0742543545693153e-05,
|
17467 |
+
"loss": 0.5085,
|
17468 |
+
"mean_token_accuracy": 0.9026155725121499,
|
17469 |
+
"num_tokens": 32227305.0,
|
17470 |
+
"step": 19400
|
17471 |
+
},
|
17472 |
+
{
|
17473 |
+
"epoch": 0.9262816306565337,
|
17474 |
+
"grad_norm": 0.3226480782032013,
|
17475 |
+
"learning_rate": 1.073777141493677e-05,
|
17476 |
+
"loss": 0.6435,
|
17477 |
+
"mean_token_accuracy": 0.8802873864769936,
|
17478 |
+
"num_tokens": 32243276.0,
|
17479 |
+
"step": 19410
|
17480 |
+
},
|
17481 |
+
{
|
17482 |
+
"epoch": 0.9267588494255479,
|
17483 |
+
"grad_norm": 0.44026854634284973,
|
17484 |
+
"learning_rate": 1.0732999284180388e-05,
|
17485 |
+
"loss": 0.622,
|
17486 |
+
"mean_token_accuracy": 0.8743364945054054,
|
17487 |
+
"num_tokens": 32260578.0,
|
17488 |
+
"step": 19420
|
17489 |
+
},
|
17490 |
+
{
|
17491 |
+
"epoch": 0.9272360681945621,
|
17492 |
+
"grad_norm": 0.29511240124702454,
|
17493 |
+
"learning_rate": 1.0728227153424005e-05,
|
17494 |
+
"loss": 0.6461,
|
17495 |
+
"mean_token_accuracy": 0.8643220156431198,
|
17496 |
+
"num_tokens": 32277850.0,
|
17497 |
+
"step": 19430
|
17498 |
+
},
|
17499 |
+
{
|
17500 |
+
"epoch": 0.9277132869635762,
|
17501 |
+
"grad_norm": 0.3299635946750641,
|
17502 |
+
"learning_rate": 1.0723455022667622e-05,
|
17503 |
+
"loss": 0.6406,
|
17504 |
+
"mean_token_accuracy": 0.8653289705514908,
|
17505 |
+
"num_tokens": 32295838.0,
|
17506 |
+
"step": 19440
|
17507 |
+
},
|
17508 |
+
{
|
17509 |
+
"epoch": 0.9281905057325904,
|
17510 |
+
"grad_norm": 0.3476797044277191,
|
17511 |
+
"learning_rate": 1.0718682891911238e-05,
|
17512 |
+
"loss": 0.6175,
|
17513 |
+
"mean_token_accuracy": 0.8700507491827011,
|
17514 |
+
"num_tokens": 32312699.0,
|
17515 |
+
"step": 19450
|
17516 |
+
},
|
17517 |
+
{
|
17518 |
+
"epoch": 0.9286677245016046,
|
17519 |
+
"grad_norm": 0.4377439320087433,
|
17520 |
+
"learning_rate": 1.0713910761154858e-05,
|
17521 |
+
"loss": 0.5511,
|
17522 |
+
"mean_token_accuracy": 0.8886492669582366,
|
17523 |
+
"num_tokens": 32328353.0,
|
17524 |
+
"step": 19460
|
17525 |
+
},
|
17526 |
+
{
|
17527 |
+
"epoch": 0.9291449432706188,
|
17528 |
+
"grad_norm": 0.41651830077171326,
|
17529 |
+
"learning_rate": 1.0709138630398475e-05,
|
17530 |
+
"loss": 0.6338,
|
17531 |
+
"mean_token_accuracy": 0.8774713531136513,
|
17532 |
+
"num_tokens": 32343869.0,
|
17533 |
+
"step": 19470
|
17534 |
+
},
|
17535 |
+
{
|
17536 |
+
"epoch": 0.929622162039633,
|
17537 |
+
"grad_norm": 0.45593252778053284,
|
17538 |
+
"learning_rate": 1.0704366499642092e-05,
|
17539 |
+
"loss": 0.6098,
|
17540 |
+
"mean_token_accuracy": 0.8867579936981201,
|
17541 |
+
"num_tokens": 32360245.0,
|
17542 |
+
"step": 19480
|
17543 |
+
},
|
17544 |
+
{
|
17545 |
+
"epoch": 0.9300993808086472,
|
17546 |
+
"grad_norm": 0.5481681227684021,
|
17547 |
+
"learning_rate": 1.0699594368885708e-05,
|
17548 |
+
"loss": 0.7036,
|
17549 |
+
"mean_token_accuracy": 0.8724937483668327,
|
17550 |
+
"num_tokens": 32376506.0,
|
17551 |
+
"step": 19490
|
17552 |
+
},
|
17553 |
+
{
|
17554 |
+
"epoch": 0.9305765995776614,
|
17555 |
+
"grad_norm": 0.4363495409488678,
|
17556 |
+
"learning_rate": 1.0694822238129325e-05,
|
17557 |
+
"loss": 0.6826,
|
17558 |
+
"mean_token_accuracy": 0.8660887077450752,
|
17559 |
+
"num_tokens": 32394083.0,
|
17560 |
+
"step": 19500
|
17561 |
}
|
17562 |
],
|
17563 |
"logging_steps": 10,
|
|
|
17577 |
"attributes": {}
|
17578 |
}
|
17579 |
},
|
17580 |
+
"total_flos": 7.29501813251924e+17,
|
17581 |
"train_batch_size": 2,
|
17582 |
"trial_name": null,
|
17583 |
"trial_params": null
|