Training in progress, step 13800, checkpoint
Browse files
.gitattributes
CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
checkpoint-13800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
checkpoint-13800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3541119728
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0a97ad14fc05d45b0bcdd00fe3398b94c48a3fba262343ce5b929ae2698e50d
|
3 |
size 3541119728
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 778374186
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:daa42de180142ef371b2befa10b05a6c991d650216eca013dca907dc8c2a9a76
|
3 |
size 778374186
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d529b3ead19a0f7e903fee329286bdbb85e6ac6fdf18146d635bf8003ed8ece
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 0.
|
6 |
"eval_steps": 500,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -12158,6 +12158,276 @@
|
|
12158 |
"mean_token_accuracy": 0.8752416774630547,
|
12159 |
"num_tokens": 22423922.0,
|
12160 |
"step": 13500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12161 |
}
|
12162 |
],
|
12163 |
"logging_steps": 10,
|
@@ -12177,7 +12447,7 @@
|
|
12177 |
"attributes": {}
|
12178 |
}
|
12179 |
},
|
12180 |
-
"total_flos": 5.
|
12181 |
"train_batch_size": 2,
|
12182 |
"trial_name": null,
|
12183 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 0.6585619012395757,
|
6 |
"eval_steps": 500,
|
7 |
+
"global_step": 13800,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
12158 |
"mean_token_accuracy": 0.8752416774630547,
|
12159 |
"num_tokens": 22423922.0,
|
12160 |
"step": 13500
|
12161 |
+
},
|
12162 |
+
{
|
12163 |
+
"epoch": 0.6447225569381644,
|
12164 |
+
"grad_norm": 0.4227236807346344,
|
12165 |
+
"learning_rate": 1.3553328561202578e-05,
|
12166 |
+
"loss": 0.7471,
|
12167 |
+
"mean_token_accuracy": 0.8619314864277839,
|
12168 |
+
"num_tokens": 22442654.0,
|
12169 |
+
"step": 13510
|
12170 |
+
},
|
12171 |
+
{
|
12172 |
+
"epoch": 0.6451997757071786,
|
12173 |
+
"grad_norm": 0.4778737425804138,
|
12174 |
+
"learning_rate": 1.3548556430446197e-05,
|
12175 |
+
"loss": 0.6241,
|
12176 |
+
"mean_token_accuracy": 0.8725541770458222,
|
12177 |
+
"num_tokens": 22458565.0,
|
12178 |
+
"step": 13520
|
12179 |
+
},
|
12180 |
+
{
|
12181 |
+
"epoch": 0.6456769944761928,
|
12182 |
+
"grad_norm": 0.3292141854763031,
|
12183 |
+
"learning_rate": 1.3543784299689813e-05,
|
12184 |
+
"loss": 0.6019,
|
12185 |
+
"mean_token_accuracy": 0.8786483362317086,
|
12186 |
+
"num_tokens": 22475131.0,
|
12187 |
+
"step": 13530
|
12188 |
+
},
|
12189 |
+
{
|
12190 |
+
"epoch": 0.6461542132452069,
|
12191 |
+
"grad_norm": 0.3959347605705261,
|
12192 |
+
"learning_rate": 1.353901216893343e-05,
|
12193 |
+
"loss": 0.5363,
|
12194 |
+
"mean_token_accuracy": 0.8921607866883278,
|
12195 |
+
"num_tokens": 22490344.0,
|
12196 |
+
"step": 13540
|
12197 |
+
},
|
12198 |
+
{
|
12199 |
+
"epoch": 0.6466314320142211,
|
12200 |
+
"grad_norm": 0.3481472134590149,
|
12201 |
+
"learning_rate": 1.3534240038177047e-05,
|
12202 |
+
"loss": 0.608,
|
12203 |
+
"mean_token_accuracy": 0.8803831622004509,
|
12204 |
+
"num_tokens": 22505993.0,
|
12205 |
+
"step": 13550
|
12206 |
+
},
|
12207 |
+
{
|
12208 |
+
"epoch": 0.6471086507832353,
|
12209 |
+
"grad_norm": 0.3353317081928253,
|
12210 |
+
"learning_rate": 1.3529467907420663e-05,
|
12211 |
+
"loss": 0.6797,
|
12212 |
+
"mean_token_accuracy": 0.8620204761624336,
|
12213 |
+
"num_tokens": 22523092.0,
|
12214 |
+
"step": 13560
|
12215 |
+
},
|
12216 |
+
{
|
12217 |
+
"epoch": 0.6475858695522495,
|
12218 |
+
"grad_norm": 0.33590102195739746,
|
12219 |
+
"learning_rate": 1.3524695776664283e-05,
|
12220 |
+
"loss": 0.6826,
|
12221 |
+
"mean_token_accuracy": 0.8625424951314926,
|
12222 |
+
"num_tokens": 22540022.0,
|
12223 |
+
"step": 13570
|
12224 |
+
},
|
12225 |
+
{
|
12226 |
+
"epoch": 0.6480630883212637,
|
12227 |
+
"grad_norm": 0.3883362412452698,
|
12228 |
+
"learning_rate": 1.35199236459079e-05,
|
12229 |
+
"loss": 0.7065,
|
12230 |
+
"mean_token_accuracy": 0.8587613448500633,
|
12231 |
+
"num_tokens": 22557276.0,
|
12232 |
+
"step": 13580
|
12233 |
+
},
|
12234 |
+
{
|
12235 |
+
"epoch": 0.6485403070902779,
|
12236 |
+
"grad_norm": 0.329208642244339,
|
12237 |
+
"learning_rate": 1.3515151515151517e-05,
|
12238 |
+
"loss": 0.6851,
|
12239 |
+
"mean_token_accuracy": 0.8579544991254806,
|
12240 |
+
"num_tokens": 22575264.0,
|
12241 |
+
"step": 13590
|
12242 |
+
},
|
12243 |
+
{
|
12244 |
+
"epoch": 0.6490175258592921,
|
12245 |
+
"grad_norm": 0.3257433772087097,
|
12246 |
+
"learning_rate": 1.3510379384395133e-05,
|
12247 |
+
"loss": 0.6501,
|
12248 |
+
"mean_token_accuracy": 0.8753771096467972,
|
12249 |
+
"num_tokens": 22592298.0,
|
12250 |
+
"step": 13600
|
12251 |
+
},
|
12252 |
+
{
|
12253 |
+
"epoch": 0.6494947446283063,
|
12254 |
+
"grad_norm": 0.319042444229126,
|
12255 |
+
"learning_rate": 1.350560725363875e-05,
|
12256 |
+
"loss": 0.6453,
|
12257 |
+
"mean_token_accuracy": 0.8721793726086616,
|
12258 |
+
"num_tokens": 22609547.0,
|
12259 |
+
"step": 13610
|
12260 |
+
},
|
12261 |
+
{
|
12262 |
+
"epoch": 0.6499719633973204,
|
12263 |
+
"grad_norm": 0.34079188108444214,
|
12264 |
+
"learning_rate": 1.3500835122882368e-05,
|
12265 |
+
"loss": 0.7824,
|
12266 |
+
"mean_token_accuracy": 0.8555491074919701,
|
12267 |
+
"num_tokens": 22628228.0,
|
12268 |
+
"step": 13620
|
12269 |
+
},
|
12270 |
+
{
|
12271 |
+
"epoch": 0.6504491821663346,
|
12272 |
+
"grad_norm": 0.45218825340270996,
|
12273 |
+
"learning_rate": 1.3496062992125985e-05,
|
12274 |
+
"loss": 0.634,
|
12275 |
+
"mean_token_accuracy": 0.8678511619567871,
|
12276 |
+
"num_tokens": 22643854.0,
|
12277 |
+
"step": 13630
|
12278 |
+
},
|
12279 |
+
{
|
12280 |
+
"epoch": 0.6509264009353488,
|
12281 |
+
"grad_norm": 0.2865401804447174,
|
12282 |
+
"learning_rate": 1.3491290861369603e-05,
|
12283 |
+
"loss": 0.6747,
|
12284 |
+
"mean_token_accuracy": 0.8544631570577621,
|
12285 |
+
"num_tokens": 22661164.0,
|
12286 |
+
"step": 13640
|
12287 |
+
},
|
12288 |
+
{
|
12289 |
+
"epoch": 0.651403619704363,
|
12290 |
+
"grad_norm": 0.4217221736907959,
|
12291 |
+
"learning_rate": 1.348651873061322e-05,
|
12292 |
+
"loss": 0.5285,
|
12293 |
+
"mean_token_accuracy": 0.8921225979924202,
|
12294 |
+
"num_tokens": 22676050.0,
|
12295 |
+
"step": 13650
|
12296 |
+
},
|
12297 |
+
{
|
12298 |
+
"epoch": 0.6518808384733772,
|
12299 |
+
"grad_norm": 0.4127669334411621,
|
12300 |
+
"learning_rate": 1.3481746599856837e-05,
|
12301 |
+
"loss": 0.5793,
|
12302 |
+
"mean_token_accuracy": 0.886702474951744,
|
12303 |
+
"num_tokens": 22691960.0,
|
12304 |
+
"step": 13660
|
12305 |
+
},
|
12306 |
+
{
|
12307 |
+
"epoch": 0.6523580572423914,
|
12308 |
+
"grad_norm": 0.3422595262527466,
|
12309 |
+
"learning_rate": 1.3476974469100455e-05,
|
12310 |
+
"loss": 0.6475,
|
12311 |
+
"mean_token_accuracy": 0.8713901385664939,
|
12312 |
+
"num_tokens": 22707891.0,
|
12313 |
+
"step": 13670
|
12314 |
+
},
|
12315 |
+
{
|
12316 |
+
"epoch": 0.6528352760114056,
|
12317 |
+
"grad_norm": 0.4279707372188568,
|
12318 |
+
"learning_rate": 1.3472202338344072e-05,
|
12319 |
+
"loss": 0.7561,
|
12320 |
+
"mean_token_accuracy": 0.8528080299496651,
|
12321 |
+
"num_tokens": 22726155.0,
|
12322 |
+
"step": 13680
|
12323 |
+
},
|
12324 |
+
{
|
12325 |
+
"epoch": 0.6533124947804198,
|
12326 |
+
"grad_norm": 0.3606453239917755,
|
12327 |
+
"learning_rate": 1.3467430207587688e-05,
|
12328 |
+
"loss": 0.7429,
|
12329 |
+
"mean_token_accuracy": 0.864837720990181,
|
12330 |
+
"num_tokens": 22744235.0,
|
12331 |
+
"step": 13690
|
12332 |
+
},
|
12333 |
+
{
|
12334 |
+
"epoch": 0.6537897135494339,
|
12335 |
+
"grad_norm": 0.38309189677238464,
|
12336 |
+
"learning_rate": 1.3462658076831305e-05,
|
12337 |
+
"loss": 0.6402,
|
12338 |
+
"mean_token_accuracy": 0.8674290254712105,
|
12339 |
+
"num_tokens": 22760300.0,
|
12340 |
+
"step": 13700
|
12341 |
+
},
|
12342 |
+
{
|
12343 |
+
"epoch": 0.6542669323184481,
|
12344 |
+
"grad_norm": 0.30889174342155457,
|
12345 |
+
"learning_rate": 1.3457885946074925e-05,
|
12346 |
+
"loss": 0.7359,
|
12347 |
+
"mean_token_accuracy": 0.8531943425536156,
|
12348 |
+
"num_tokens": 22778164.0,
|
12349 |
+
"step": 13710
|
12350 |
+
},
|
12351 |
+
{
|
12352 |
+
"epoch": 0.6547441510874623,
|
12353 |
+
"grad_norm": 0.3210035264492035,
|
12354 |
+
"learning_rate": 1.3453113815318542e-05,
|
12355 |
+
"loss": 0.5691,
|
12356 |
+
"mean_token_accuracy": 0.892583754658699,
|
12357 |
+
"num_tokens": 22792570.0,
|
12358 |
+
"step": 13720
|
12359 |
+
},
|
12360 |
+
{
|
12361 |
+
"epoch": 0.6552213698564765,
|
12362 |
+
"grad_norm": 0.2989923357963562,
|
12363 |
+
"learning_rate": 1.3448341684562158e-05,
|
12364 |
+
"loss": 0.5318,
|
12365 |
+
"mean_token_accuracy": 0.8902983129024505,
|
12366 |
+
"num_tokens": 22807960.0,
|
12367 |
+
"step": 13730
|
12368 |
+
},
|
12369 |
+
{
|
12370 |
+
"epoch": 0.6556985886254907,
|
12371 |
+
"grad_norm": 0.381619393825531,
|
12372 |
+
"learning_rate": 1.3443569553805775e-05,
|
12373 |
+
"loss": 0.717,
|
12374 |
+
"mean_token_accuracy": 0.8556675240397453,
|
12375 |
+
"num_tokens": 22826319.0,
|
12376 |
+
"step": 13740
|
12377 |
+
},
|
12378 |
+
{
|
12379 |
+
"epoch": 0.6561758073945049,
|
12380 |
+
"grad_norm": 0.33662042021751404,
|
12381 |
+
"learning_rate": 1.3438797423049392e-05,
|
12382 |
+
"loss": 0.6839,
|
12383 |
+
"mean_token_accuracy": 0.8628215402364731,
|
12384 |
+
"num_tokens": 22843310.0,
|
12385 |
+
"step": 13750
|
12386 |
+
},
|
12387 |
+
{
|
12388 |
+
"epoch": 0.6566530261635191,
|
12389 |
+
"grad_norm": 0.30493494868278503,
|
12390 |
+
"learning_rate": 1.3434025292293008e-05,
|
12391 |
+
"loss": 0.5978,
|
12392 |
+
"mean_token_accuracy": 0.8812342941761017,
|
12393 |
+
"num_tokens": 22858101.0,
|
12394 |
+
"step": 13760
|
12395 |
+
},
|
12396 |
+
{
|
12397 |
+
"epoch": 0.6571302449325332,
|
12398 |
+
"grad_norm": 0.4126700460910797,
|
12399 |
+
"learning_rate": 1.3429253161536627e-05,
|
12400 |
+
"loss": 0.652,
|
12401 |
+
"mean_token_accuracy": 0.8751346081495285,
|
12402 |
+
"num_tokens": 22874216.0,
|
12403 |
+
"step": 13770
|
12404 |
+
},
|
12405 |
+
{
|
12406 |
+
"epoch": 0.6576074637015474,
|
12407 |
+
"grad_norm": 0.3574364185333252,
|
12408 |
+
"learning_rate": 1.3424481030780245e-05,
|
12409 |
+
"loss": 0.6295,
|
12410 |
+
"mean_token_accuracy": 0.8706316411495209,
|
12411 |
+
"num_tokens": 22889521.0,
|
12412 |
+
"step": 13780
|
12413 |
+
},
|
12414 |
+
{
|
12415 |
+
"epoch": 0.6580846824705616,
|
12416 |
+
"grad_norm": 0.4885793924331665,
|
12417 |
+
"learning_rate": 1.3419708900023862e-05,
|
12418 |
+
"loss": 0.5987,
|
12419 |
+
"mean_token_accuracy": 0.8812028467655182,
|
12420 |
+
"num_tokens": 22907349.0,
|
12421 |
+
"step": 13790
|
12422 |
+
},
|
12423 |
+
{
|
12424 |
+
"epoch": 0.6585619012395757,
|
12425 |
+
"grad_norm": 0.33491700887680054,
|
12426 |
+
"learning_rate": 1.3414936769267479e-05,
|
12427 |
+
"loss": 0.6659,
|
12428 |
+
"mean_token_accuracy": 0.8643362104892731,
|
12429 |
+
"num_tokens": 22924082.0,
|
12430 |
+
"step": 13800
|
12431 |
}
|
12432 |
],
|
12433 |
"logging_steps": 10,
|
|
|
12447 |
"attributes": {}
|
12448 |
}
|
12449 |
},
|
12450 |
+
"total_flos": 5.161398467590963e+17,
|
12451 |
"train_batch_size": 2,
|
12452 |
"trial_name": null,
|
12453 |
"trial_params": null
|