Training in progress, step 2500, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +370 -4

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:39b65f1f5bdc8e68c678fe87b09e42b54b9eba1640a82fd33222297da0ee47b3
 size 2066752

 version https://git-lfs.github.com/spec/v1
+oid sha256:4156994b0d2538aa2d37af1880314c638cc4d24a008e661dbcf7a289e3478dca
 size 2066752

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ef4e35f5dcad96cb1089fe9a38ce911a58e849647e472ddf5a739d5e6986aa33
 size 2162798

 version https://git-lfs.github.com/spec/v1
+oid sha256:fba8dbfefb423350e220e5a13f8bccbed1d56b9ffd0aa6cc568c8e5687f191cf
 size 2162798

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:19ab3d6cfcb43de67f16e412d0cb4f86309db602f8242d16f2b203a0212d6cbb
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:97474a95cf2d0e6166f036d8937e33ebebb2adb23cf1177f88edc10dc549c905
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c88b3aeb8ec2bf995149291b90b69667d3f268ff2f13afbeab1a220b8cc27590
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:7b204b20cdc92a140e2e21e015bdaa04af008c00e0bde30e59edf0f23817a338
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.5918910920390648,
   "eval_steps": 200,
-  "global_step": 2000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1495,6 +1495,372 @@
       "eval_samples_per_second": 94.292,
       "eval_steps_per_second": 23.604,
       "step": 2000
     }
   ],
   "logging_steps": 10,
@@ -1509,12 +1875,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 51272680734720.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.7398638650488311,
   "eval_steps": 200,
+  "global_step": 2500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 94.292,
       "eval_steps_per_second": 23.604,
       "step": 2000
+    },
+    {
+      "epoch": 0.5948505474992601,
+      "grad_norm": 0.55078125,
+      "learning_rate": 2.0280410844810428e-05,
+      "loss": 8.6746,
+      "step": 2010
+    },
+    {
+      "epoch": 0.5978100029594554,
+      "grad_norm": 0.490234375,
+      "learning_rate": 1.9488821249060297e-05,
+      "loss": 8.7101,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6007694584196508,
+      "grad_norm": 0.546875,
+      "learning_rate": 1.871131877836879e-05,
+      "loss": 8.6891,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6037289138798461,
+      "grad_norm": 0.60546875,
+      "learning_rate": 1.7948039473155554e-05,
+      "loss": 8.6504,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6066883693400414,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.7199116885197995e-05,
+      "loss": 8.7523,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6096478248002367,
+      "grad_norm": 0.4921875,
+      "learning_rate": 1.646468205426377e-05,
+      "loss": 8.6832,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6126072802604321,
+      "grad_norm": 0.466796875,
+      "learning_rate": 1.5744863485182537e-05,
+      "loss": 8.7104,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6155667357206274,
+      "grad_norm": 0.55859375,
+      "learning_rate": 1.5039787125361326e-05,
+      "loss": 8.6838,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6185261911808227,
+      "grad_norm": 0.7421875,
+      "learning_rate": 1.4349576342747462e-05,
+      "loss": 8.7315,
+      "step": 2090
+    },
+    {
+      "epoch": 0.621485646641018,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.3674351904242611e-05,
+      "loss": 8.622,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6244451021012134,
+      "grad_norm": 0.4765625,
+      "learning_rate": 1.3014231954572287e-05,
+      "loss": 8.6708,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6274045575614087,
+      "grad_norm": 0.6171875,
+      "learning_rate": 1.2369331995613665e-05,
+      "loss": 8.6699,
+      "step": 2120
+    },
+    {
+      "epoch": 0.630364013021604,
+      "grad_norm": 0.56640625,
+      "learning_rate": 1.173976486618631e-05,
+      "loss": 8.6815,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6333234684817993,
+      "grad_norm": 0.69921875,
+      "learning_rate": 1.1125640722308628e-05,
+      "loss": 8.7564,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6362829239419947,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.0527067017923654e-05,
+      "loss": 8.6414,
+      "step": 2150
+    },
+    {
+      "epoch": 0.63924237940219,
+      "grad_norm": 0.4609375,
+      "learning_rate": 9.944148486097793e-06,
+      "loss": 8.7073,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6422018348623854,
+      "grad_norm": 0.45703125,
+      "learning_rate": 9.376987120695545e-06,
+      "loss": 8.6823,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 0.56640625,
+      "learning_rate": 8.825682158533554e-06,
+      "loss": 8.6332,
+      "step": 2180
+    },
+    {
+      "epoch": 0.648120745782776,
+      "grad_norm": 0.6953125,
+      "learning_rate": 8.290330062017016e-06,
+      "loss": 8.6951,
+      "step": 2190
+    },
+    {
+      "epoch": 0.6510802012429713,
+      "grad_norm": 0.95703125,
+      "learning_rate": 7.771024502261526e-06,
+      "loss": 8.7376,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6510802012429713,
+      "eval_loss": 8.695413589477539,
+      "eval_runtime": 18.9893,
+      "eval_samples_per_second": 79.097,
+      "eval_steps_per_second": 19.801,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6540396567031667,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.267856342703461e-06,
+      "loss": 8.714,
+      "step": 2210
+    },
+    {
+      "epoch": 0.6569991121633619,
+      "grad_norm": 0.52734375,
+      "learning_rate": 6.780913623201346e-06,
+      "loss": 8.6495,
+      "step": 2220
+    },
+    {
+      "epoch": 0.6599585676235573,
+      "grad_norm": 0.61328125,
+      "learning_rate": 6.310281544631546e-06,
+      "loss": 8.7043,
+      "step": 2230
+    },
+    {
+      "epoch": 0.6629180230837526,
+      "grad_norm": 0.62890625,
+      "learning_rate": 5.856042453980526e-06,
+      "loss": 8.6877,
+      "step": 2240
+    },
+    {
+      "epoch": 0.665877478543948,
+      "grad_norm": 1.25,
+      "learning_rate": 5.418275829936537e-06,
+      "loss": 8.6366,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6688369340041432,
+      "grad_norm": 0.470703125,
+      "learning_rate": 4.997058268983135e-06,
+      "loss": 8.7548,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6717963894643386,
+      "grad_norm": 0.53515625,
+      "learning_rate": 4.592463471997022e-06,
+      "loss": 8.6756,
+      "step": 2270
+    },
+    {
+      "epoch": 0.6747558449245339,
+      "grad_norm": 0.482421875,
+      "learning_rate": 4.204562231352516e-06,
+      "loss": 8.6466,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6777153003847292,
+      "grad_norm": 0.58203125,
+      "learning_rate": 3.83342241853496e-06,
+      "loss": 8.6794,
+      "step": 2290
+    },
+    {
+      "epoch": 0.6806747558449245,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.4791089722651436e-06,
+      "loss": 8.6301,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6836342113051198,
+      "grad_norm": 0.546875,
+      "learning_rate": 3.1416838871368924e-06,
+      "loss": 8.6403,
+      "step": 2310
+    },
+    {
+      "epoch": 0.6865936667653152,
+      "grad_norm": 0.5,
+      "learning_rate": 2.821206202769899e-06,
+      "loss": 8.7124,
+      "step": 2320
+    },
+    {
+      "epoch": 0.6895531222255105,
+      "grad_norm": 0.765625,
+      "learning_rate": 2.5177319934794e-06,
+      "loss": 8.6445,
+      "step": 2330
+    },
+    {
+      "epoch": 0.6925125776857058,
+      "grad_norm": 0.6328125,
+      "learning_rate": 2.2313143584648423e-06,
+      "loss": 8.6972,
+      "step": 2340
+    },
+    {
+      "epoch": 0.6954720331459011,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.9620034125190644e-06,
+      "loss": 8.9133,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6984314886060965,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.7098462772596302e-06,
+      "loss": 8.6366,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7013909440662918,
+      "grad_norm": 0.51953125,
+      "learning_rate": 1.4748870728839347e-06,
+      "loss": 8.6377,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7043503995264871,
+      "grad_norm": 0.470703125,
+      "learning_rate": 1.2571669104494256e-06,
+      "loss": 8.6656,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7073098549866824,
+      "grad_norm": 0.71484375,
+      "learning_rate": 1.0567238846803996e-06,
+      "loss": 8.6525,
+      "step": 2390
+    },
+    {
+      "epoch": 0.7102693104468778,
+      "grad_norm": 0.7421875,
+      "learning_rate": 8.735930673024806e-07,
+      "loss": 8.573,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7102693104468778,
+      "eval_loss": 8.698899269104004,
+      "eval_runtime": 10.0141,
+      "eval_samples_per_second": 149.989,
+      "eval_steps_per_second": 37.547,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7132287659070731,
+      "grad_norm": 0.4921875,
+      "learning_rate": 7.078065009060941e-07,
+      "loss": 8.6492,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7161882213672685,
+      "grad_norm": 0.4765625,
+      "learning_rate": 5.593931933399854e-07,
+      "loss": 8.6976,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7191476768274637,
+      "grad_norm": 0.515625,
+      "learning_rate": 4.2837911263562404e-07,
+      "loss": 8.6823,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7221071322876591,
+      "grad_norm": 0.6875,
+      "learning_rate": 3.1478718246357173e-07,
+      "loss": 8.7054,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7250665877478544,
+      "grad_norm": 1.28125,
+      "learning_rate": 2.1863727812254653e-07,
+      "loss": 8.81,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7280260432080498,
+      "grad_norm": 0.427734375,
+      "learning_rate": 1.3994622306173765e-07,
+      "loss": 8.6948,
+      "step": 2460
+    },
+    {
+      "epoch": 0.730985498668245,
+      "grad_norm": 0.48046875,
+      "learning_rate": 7.872778593728258e-08,
+      "loss": 8.6751,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7339449541284404,
+      "grad_norm": 0.59375,
+      "learning_rate": 3.499267820307184e-08,
+      "loss": 8.7253,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7369044095886357,
+      "grad_norm": 0.60546875,
+      "learning_rate": 8.748552236603757e-09,
+      "loss": 8.7342,
+      "step": 2490
+    },
+    {
+      "epoch": 0.7398638650488311,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0,
+      "loss": 8.645,
+      "step": 2500
     }
   ],
   "logging_steps": 10,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 64161175044096.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null