End of training

Browse files

Files changed (6) hide show

README.md +19 -2
all_results.json +16 -0
eval_results.json +10 -0
runs/Jan03_21-06-41_node16/events.out.tfevents.1735907635.node16 +3 -0
train_results.json +9 -0
trainer_state.json +329 -0

README.md CHANGED Viewed

@@ -4,9 +4,23 @@ license: llama3.2
 base_model: meta-llama/Llama-3.2-1B-Instruct
 tags:
 - generated_from_trainer
 model-index:
 - name: Llama-3.2-1B-Instruct-sft_metamath
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +28,10 @@ should probably proofread and complete it, then remove this comment. -->
 # Llama-3.2-1B-Instruct-sft_metamath
-This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on an unknown dataset.
 ## Model description

 base_model: meta-llama/Llama-3.2-1B-Instruct
 tags:
 - generated_from_trainer
+datasets:
+- gohsyi/metamath-sft
+metrics:
+- accuracy
 model-index:
 - name: Llama-3.2-1B-Instruct-sft_metamath
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: gohsyi/metamath-sft
+      type: gohsyi/metamath-sft
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.8814735253307663
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # Llama-3.2-1B-Instruct-sft_metamath
+This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the gohsyi/metamath-sft dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.4330
+- Accuracy: 0.8815
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 2.9950650515926425,
+    "eval_accuracy": 0.8814735253307663,
+    "eval_loss": 0.433001846075058,
+    "eval_runtime": 49.6494,
+    "eval_samples": 3290,
+    "eval_samples_per_second": 66.265,
+    "eval_steps_per_second": 4.149,
+    "perplexity": 1.5418790671224933,
+    "total_flos": 1.1169746555638907e+18,
+    "train_loss": 0.5152784274350539,
+    "train_runtime": 8326.5034,
+    "train_samples": 62403,
+    "train_samples_per_second": 22.484,
+    "train_steps_per_second": 0.05
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 2.9950650515926425,
+    "eval_accuracy": 0.8814735253307663,
+    "eval_loss": 0.433001846075058,
+    "eval_runtime": 49.6494,
+    "eval_samples": 3290,
+    "eval_samples_per_second": 66.265,
+    "eval_steps_per_second": 4.149,
+    "perplexity": 1.5418790671224933
+}

runs/Jan03_21-06-41_node16/events.out.tfevents.1735907635.node16 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c8514377bcbef5a0c02784df39c2f60af71700ceaf0aa6ce54887ad4545f40
+size 40

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 2.9950650515926425,
+    "total_flos": 1.1169746555638907e+18,
+    "train_loss": 0.5152784274350539,
+    "train_runtime": 8326.5034,
+    "train_samples": 62403,
+    "train_samples_per_second": 22.484,
+    "train_steps_per_second": 0.05
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,329 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9950650515926425,
+  "eval_steps": 500,
+  "global_step": 417,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.07178106774338268,
+      "grad_norm": 0.46034833788871765,
+      "learning_rate": 2.380952380952381e-06,
+      "loss": 1.2735,
+      "step": 10
+    },
+    {
+      "epoch": 0.14356213548676536,
+      "grad_norm": 0.17795707285404205,
+      "learning_rate": 4.761904761904762e-06,
+      "loss": 0.9311,
+      "step": 20
+    },
+    {
+      "epoch": 0.21534320323014805,
+      "grad_norm": 0.07722567766904831,
+      "learning_rate": 7.1428571428571436e-06,
+      "loss": 0.7258,
+      "step": 30
+    },
+    {
+      "epoch": 0.2871242709735307,
+      "grad_norm": 0.058059412986040115,
+      "learning_rate": 9.523809523809525e-06,
+      "loss": 0.6562,
+      "step": 40
+    },
+    {
+      "epoch": 0.35890533871691344,
+      "grad_norm": 0.05688474327325821,
+      "learning_rate": 9.786666666666667e-06,
+      "loss": 0.6236,
+      "step": 50
+    },
+    {
+      "epoch": 0.4306864064602961,
+      "grad_norm": 0.0652860477566719,
+      "learning_rate": 9.52e-06,
+      "loss": 0.6011,
+      "step": 60
+    },
+    {
+      "epoch": 0.5024674742036788,
+      "grad_norm": 0.09465105831623077,
+      "learning_rate": 9.253333333333333e-06,
+      "loss": 0.5775,
+      "step": 70
+    },
+    {
+      "epoch": 0.5742485419470614,
+      "grad_norm": 0.16655535995960236,
+      "learning_rate": 8.986666666666666e-06,
+      "loss": 0.5588,
+      "step": 80
+    },
+    {
+      "epoch": 0.6460296096904441,
+      "grad_norm": 0.11725710332393646,
+      "learning_rate": 8.720000000000001e-06,
+      "loss": 0.537,
+      "step": 90
+    },
+    {
+      "epoch": 0.7178106774338269,
+      "grad_norm": 0.044058505445718765,
+      "learning_rate": 8.453333333333334e-06,
+      "loss": 0.5187,
+      "step": 100
+    },
+    {
+      "epoch": 0.7895917451772095,
+      "grad_norm": 0.022063592448830605,
+      "learning_rate": 8.186666666666667e-06,
+      "loss": 0.51,
+      "step": 110
+    },
+    {
+      "epoch": 0.8613728129205922,
+      "grad_norm": 0.021037070080637932,
+      "learning_rate": 7.92e-06,
+      "loss": 0.5043,
+      "step": 120
+    },
+    {
+      "epoch": 0.9331538806639749,
+      "grad_norm": 0.01985151134431362,
+      "learning_rate": 7.653333333333333e-06,
+      "loss": 0.4985,
+      "step": 130
+    },
+    {
+      "epoch": 1.0058322117541498,
+      "grad_norm": 0.04843816161155701,
+      "learning_rate": 7.386666666666667e-06,
+      "loss": 0.5345,
+      "step": 140
+    },
+    {
+      "epoch": 1.0776132794975326,
+      "grad_norm": 0.02092103101313114,
+      "learning_rate": 7.1200000000000004e-06,
+      "loss": 0.4855,
+      "step": 150
+    },
+    {
+      "epoch": 1.1493943472409152,
+      "grad_norm": 0.020286045968532562,
+      "learning_rate": 6.853333333333334e-06,
+      "loss": 0.4794,
+      "step": 160
+    },
+    {
+      "epoch": 1.221175414984298,
+      "grad_norm": 0.020133651793003082,
+      "learning_rate": 6.5866666666666666e-06,
+      "loss": 0.4779,
+      "step": 170
+    },
+    {
+      "epoch": 1.2929564827276807,
+      "grad_norm": 0.020597418770194054,
+      "learning_rate": 6.3200000000000005e-06,
+      "loss": 0.4738,
+      "step": 180
+    },
+    {
+      "epoch": 1.3647375504710633,
+      "grad_norm": 0.020543133839964867,
+      "learning_rate": 6.0533333333333335e-06,
+      "loss": 0.4713,
+      "step": 190
+    },
+    {
+      "epoch": 1.4365186182144458,
+      "grad_norm": 0.0203793253749609,
+      "learning_rate": 5.7866666666666674e-06,
+      "loss": 0.4665,
+      "step": 200
+    },
+    {
+      "epoch": 1.5082996859578286,
+      "grad_norm": 0.020146360620856285,
+      "learning_rate": 5.5200000000000005e-06,
+      "loss": 0.4627,
+      "step": 210
+    },
+    {
+      "epoch": 1.5800807537012114,
+      "grad_norm": 0.020714716985821724,
+      "learning_rate": 5.2533333333333336e-06,
+      "loss": 0.4609,
+      "step": 220
+    },
+    {
+      "epoch": 1.651861821444594,
+      "grad_norm": 0.020331306383013725,
+      "learning_rate": 4.986666666666667e-06,
+      "loss": 0.458,
+      "step": 230
+    },
+    {
+      "epoch": 1.7236428891879767,
+      "grad_norm": 0.019636554643511772,
+      "learning_rate": 4.7200000000000005e-06,
+      "loss": 0.4559,
+      "step": 240
+    },
+    {
+      "epoch": 1.7954239569313595,
+      "grad_norm": 0.020189929753541946,
+      "learning_rate": 4.453333333333334e-06,
+      "loss": 0.454,
+      "step": 250
+    },
+    {
+      "epoch": 1.867205024674742,
+      "grad_norm": 0.020626794546842575,
+      "learning_rate": 4.1866666666666675e-06,
+      "loss": 0.4507,
+      "step": 260
+    },
+    {
+      "epoch": 1.9389860924181246,
+      "grad_norm": 0.02100289985537529,
+      "learning_rate": 3.920000000000001e-06,
+      "loss": 0.4498,
+      "step": 270
+    },
+    {
+      "epoch": 2.0116644235082997,
+      "grad_norm": 0.020512910559773445,
+      "learning_rate": 3.6533333333333336e-06,
+      "loss": 0.4824,
+      "step": 280
+    },
+    {
+      "epoch": 2.083445491251682,
+      "grad_norm": 0.02125644125044346,
+      "learning_rate": 3.386666666666667e-06,
+      "loss": 0.4383,
+      "step": 290
+    },
+    {
+      "epoch": 2.155226558995065,
+      "grad_norm": 0.021306023001670837,
+      "learning_rate": 3.12e-06,
+      "loss": 0.4377,
+      "step": 300
+    },
+    {
+      "epoch": 2.2270076267384478,
+      "grad_norm": 0.020120656117796898,
+      "learning_rate": 2.8533333333333337e-06,
+      "loss": 0.4367,
+      "step": 310
+    },
+    {
+      "epoch": 2.2987886944818303,
+      "grad_norm": 0.020625969395041466,
+      "learning_rate": 2.5866666666666667e-06,
+      "loss": 0.4338,
+      "step": 320
+    },
+    {
+      "epoch": 2.3705697622252133,
+      "grad_norm": 0.020297806710004807,
+      "learning_rate": 2.3200000000000002e-06,
+      "loss": 0.4324,
+      "step": 330
+    },
+    {
+      "epoch": 2.442350829968596,
+      "grad_norm": 0.020711807534098625,
+      "learning_rate": 2.0533333333333337e-06,
+      "loss": 0.4317,
+      "step": 340
+    },
+    {
+      "epoch": 2.5141318977119784,
+      "grad_norm": 0.020219726487994194,
+      "learning_rate": 1.7866666666666668e-06,
+      "loss": 0.4311,
+      "step": 350
+    },
+    {
+      "epoch": 2.5859129654553614,
+      "grad_norm": 0.020259831100702286,
+      "learning_rate": 1.52e-06,
+      "loss": 0.4302,
+      "step": 360
+    },
+    {
+      "epoch": 2.657694033198744,
+      "grad_norm": 0.01994330622255802,
+      "learning_rate": 1.2533333333333333e-06,
+      "loss": 0.4277,
+      "step": 370
+    },
+    {
+      "epoch": 2.7294751009421265,
+      "grad_norm": 0.020670117810368538,
+      "learning_rate": 9.866666666666668e-07,
+      "loss": 0.4286,
+      "step": 380
+    },
+    {
+      "epoch": 2.801256168685509,
+      "grad_norm": 0.02084210328757763,
+      "learning_rate": 7.2e-07,
+      "loss": 0.4267,
+      "step": 390
+    },
+    {
+      "epoch": 2.8730372364288916,
+      "grad_norm": 0.019426610320806503,
+      "learning_rate": 4.533333333333334e-07,
+      "loss": 0.4264,
+      "step": 400
+    },
+    {
+      "epoch": 2.9448183041722746,
+      "grad_norm": 0.020601661875844002,
+      "learning_rate": 1.866666666666667e-07,
+      "loss": 0.4282,
+      "step": 410
+    },
+    {
+      "epoch": 2.9950650515926425,
+      "step": 417,
+      "total_flos": 1.1169746555638907e+18,
+      "train_loss": 0.5152784274350539,
+      "train_runtime": 8326.5034,
+      "train_samples_per_second": 22.484,
+      "train_steps_per_second": 0.05
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 417,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1169746555638907e+18,
+  "train_batch_size": 14,
+  "trial_name": null,
+  "trial_params": null
+}