jainamit committed on
Commit
b35b6d9
·
verified ·
1 Parent(s): 86ce8c4

Model save

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +4 -4
  3. config.json +1 -1
  4. train_results.json +4 -4
  5. trainer_state.json +14 -8
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jainamitnew-penn-state/huggingface/runs/6ccmz8b9)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jainamitnew-penn-state/huggingface/runs/i5y5acqm)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00016750305739037685,
4
- "train_runtime": 21555.541,
5
  "train_samples": 45000,
6
- "train_samples_per_second": 0.501,
7
- "train_steps_per_second": 0.021
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.154237393785408e-07,
4
+ "train_runtime": 77.0261,
5
  "train_samples": 45000,
6
+ "train_samples_per_second": 140.212,
7
+ "train_steps_per_second": 5.842
8
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.48.1",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.48.1",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00016750305739037685,
4
- "train_runtime": 21555.541,
5
  "train_samples": 45000,
6
- "train_samples_per_second": 0.501,
7
- "train_steps_per_second": 0.021
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.154237393785408e-07,
4
+ "train_runtime": 77.0261,
5
  "train_samples": 45000,
6
+ "train_samples_per_second": 140.212,
7
+ "train_steps_per_second": 5.842
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.24,
5
  "eval_steps": 500,
6
- "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2934,13 +2934,19 @@
2934
  "step": 450
2935
  },
2936
  {
2937
- "epoch": 0.24,
2938
- "step": 450,
 
 
 
 
 
 
2939
  "total_flos": 0.0,
2940
- "train_loss": 0.00016750305739037685,
2941
- "train_runtime": 21555.541,
2942
- "train_samples_per_second": 0.501,
2943
- "train_steps_per_second": 0.021
2944
  }
2945
  ],
2946
  "logging_steps": 2,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.24053333333333332,
5
  "eval_steps": 500,
6
+ "global_step": 451,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2934
  "step": 450
2935
  },
2936
  {
2937
+ "completion_length": 381.8854293823242,
2938
+ "epoch": 0.24053333333333332,
2939
+ "kl": 0.2254638671875,
2940
+ "reward": 1.348958358168602,
2941
+ "reward_std": 0.22792584728449583,
2942
+ "rewards/equation_reward_func": 0.4270833469927311,
2943
+ "rewards/format_reward_func": 0.9218750149011612,
2944
+ "step": 451,
2945
  "total_flos": 0.0,
2946
+ "train_loss": 5.154237393785408e-07,
2947
+ "train_runtime": 77.0261,
2948
+ "train_samples_per_second": 140.212,
2949
+ "train_steps_per_second": 5.842
2950
  }
2951
  ],
2952
  "logging_steps": 2,