End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +857 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3
 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_evol_instruct_140k
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # OH_DCFT_V3_wo_evol_instruct_140k
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6165

 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_evol_instruct_140k
 # OH_DCFT_V3_wo_evol_instruct_140k
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_evol_instruct_140k dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6165

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.6164913177490234,
+    "eval_runtime": 203.8494,
+    "eval_samples_per_second": 49.836,
+    "eval_steps_per_second": 0.392,
+    "total_flos": 1894048365281280.0,
+    "train_loss": 0.5957063028603713,
+    "train_runtime": 34171.7918,
+    "train_samples_per_second": 16.944,
+    "train_steps_per_second": 0.033
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.6164913177490234,
+    "eval_runtime": 203.8494,
+    "eval_samples_per_second": 49.836,
+    "eval_steps_per_second": 0.392
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1894048365281280.0,
+    "train_loss": 0.5957063028603713,
+    "train_runtime": 34171.7918,
+    "train_samples_per_second": 16.944,
+    "train_steps_per_second": 0.033
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,857 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1131,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.026525198938992044,
+      "grad_norm": 2.329897814074136,
+      "learning_rate": 5e-06,
+      "loss": 0.8884,
+      "step": 10
+    },
+    {
+      "epoch": 0.05305039787798409,
+      "grad_norm": 3.8595726803473838,
+      "learning_rate": 5e-06,
+      "loss": 0.7961,
+      "step": 20
+    },
+    {
+      "epoch": 0.07957559681697612,
+      "grad_norm": 1.096843401953888,
+      "learning_rate": 5e-06,
+      "loss": 0.7478,
+      "step": 30
+    },
+    {
+      "epoch": 0.10610079575596817,
+      "grad_norm": 1.1211395081081943,
+      "learning_rate": 5e-06,
+      "loss": 0.719,
+      "step": 40
+    },
+    {
+      "epoch": 0.13262599469496023,
+      "grad_norm": 1.4282528066175646,
+      "learning_rate": 5e-06,
+      "loss": 0.7052,
+      "step": 50
+    },
+    {
+      "epoch": 0.15915119363395225,
+      "grad_norm": 0.7596233206271922,
+      "learning_rate": 5e-06,
+      "loss": 0.694,
+      "step": 60
+    },
+    {
+      "epoch": 0.1856763925729443,
+      "grad_norm": 1.2777068139599712,
+      "learning_rate": 5e-06,
+      "loss": 0.6882,
+      "step": 70
+    },
+    {
+      "epoch": 0.21220159151193635,
+      "grad_norm": 1.1383877936666695,
+      "learning_rate": 5e-06,
+      "loss": 0.6771,
+      "step": 80
+    },
+    {
+      "epoch": 0.23872679045092837,
+      "grad_norm": 1.0060703202699885,
+      "learning_rate": 5e-06,
+      "loss": 0.6806,
+      "step": 90
+    },
+    {
+      "epoch": 0.26525198938992045,
+      "grad_norm": 1.0179544972489034,
+      "learning_rate": 5e-06,
+      "loss": 0.6665,
+      "step": 100
+    },
+    {
+      "epoch": 0.2917771883289125,
+      "grad_norm": 0.6720815026983992,
+      "learning_rate": 5e-06,
+      "loss": 0.6637,
+      "step": 110
+    },
+    {
+      "epoch": 0.3183023872679045,
+      "grad_norm": 0.9241587286794601,
+      "learning_rate": 5e-06,
+      "loss": 0.6546,
+      "step": 120
+    },
+    {
+      "epoch": 0.3448275862068966,
+      "grad_norm": 0.8824991950738043,
+      "learning_rate": 5e-06,
+      "loss": 0.6522,
+      "step": 130
+    },
+    {
+      "epoch": 0.3713527851458886,
+      "grad_norm": 0.8302884563721783,
+      "learning_rate": 5e-06,
+      "loss": 0.653,
+      "step": 140
+    },
+    {
+      "epoch": 0.3978779840848806,
+      "grad_norm": 0.8327701504024553,
+      "learning_rate": 5e-06,
+      "loss": 0.6455,
+      "step": 150
+    },
+    {
+      "epoch": 0.4244031830238727,
+      "grad_norm": 0.5492876289580202,
+      "learning_rate": 5e-06,
+      "loss": 0.6553,
+      "step": 160
+    },
+    {
+      "epoch": 0.4509283819628647,
+      "grad_norm": 0.7119323938911869,
+      "learning_rate": 5e-06,
+      "loss": 0.6448,
+      "step": 170
+    },
+    {
+      "epoch": 0.47745358090185674,
+      "grad_norm": 0.8042997280772343,
+      "learning_rate": 5e-06,
+      "loss": 0.6403,
+      "step": 180
+    },
+    {
+      "epoch": 0.5039787798408488,
+      "grad_norm": 0.7810674802506331,
+      "learning_rate": 5e-06,
+      "loss": 0.655,
+      "step": 190
+    },
+    {
+      "epoch": 0.5305039787798409,
+      "grad_norm": 0.5391416674289061,
+      "learning_rate": 5e-06,
+      "loss": 0.6398,
+      "step": 200
+    },
+    {
+      "epoch": 0.5570291777188329,
+      "grad_norm": 0.6885851666539189,
+      "learning_rate": 5e-06,
+      "loss": 0.6452,
+      "step": 210
+    },
+    {
+      "epoch": 0.583554376657825,
+      "grad_norm": 0.6168676023297931,
+      "learning_rate": 5e-06,
+      "loss": 0.6443,
+      "step": 220
+    },
+    {
+      "epoch": 0.610079575596817,
+      "grad_norm": 0.5971017602698001,
+      "learning_rate": 5e-06,
+      "loss": 0.636,
+      "step": 230
+    },
+    {
+      "epoch": 0.636604774535809,
+      "grad_norm": 0.6022582014461996,
+      "learning_rate": 5e-06,
+      "loss": 0.642,
+      "step": 240
+    },
+    {
+      "epoch": 0.6631299734748011,
+      "grad_norm": 0.500260575545034,
+      "learning_rate": 5e-06,
+      "loss": 0.6344,
+      "step": 250
+    },
+    {
+      "epoch": 0.6896551724137931,
+      "grad_norm": 0.537336125918856,
+      "learning_rate": 5e-06,
+      "loss": 0.639,
+      "step": 260
+    },
+    {
+      "epoch": 0.7161803713527851,
+      "grad_norm": 0.5365620197699547,
+      "learning_rate": 5e-06,
+      "loss": 0.6403,
+      "step": 270
+    },
+    {
+      "epoch": 0.7427055702917772,
+      "grad_norm": 0.7683273733374665,
+      "learning_rate": 5e-06,
+      "loss": 0.6401,
+      "step": 280
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.5276716709186424,
+      "learning_rate": 5e-06,
+      "loss": 0.6338,
+      "step": 290
+    },
+    {
+      "epoch": 0.7957559681697612,
+      "grad_norm": 0.5074059045052902,
+      "learning_rate": 5e-06,
+      "loss": 0.6371,
+      "step": 300
+    },
+    {
+      "epoch": 0.8222811671087533,
+      "grad_norm": 0.67204833216127,
+      "learning_rate": 5e-06,
+      "loss": 0.639,
+      "step": 310
+    },
+    {
+      "epoch": 0.8488063660477454,
+      "grad_norm": 0.5619464110309291,
+      "learning_rate": 5e-06,
+      "loss": 0.6427,
+      "step": 320
+    },
+    {
+      "epoch": 0.8753315649867374,
+      "grad_norm": 0.5724558992220249,
+      "learning_rate": 5e-06,
+      "loss": 0.6294,
+      "step": 330
+    },
+    {
+      "epoch": 0.9018567639257294,
+      "grad_norm": 0.619777380357849,
+      "learning_rate": 5e-06,
+      "loss": 0.6334,
+      "step": 340
+    },
+    {
+      "epoch": 0.9283819628647215,
+      "grad_norm": 0.7434000365286857,
+      "learning_rate": 5e-06,
+      "loss": 0.6329,
+      "step": 350
+    },
+    {
+      "epoch": 0.9549071618037135,
+      "grad_norm": 0.5490872236122448,
+      "learning_rate": 5e-06,
+      "loss": 0.6286,
+      "step": 360
+    },
+    {
+      "epoch": 0.9814323607427056,
+      "grad_norm": 0.584736306154939,
+      "learning_rate": 5e-06,
+      "loss": 0.6296,
+      "step": 370
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.6237567663192749,
+      "eval_runtime": 202.8984,
+      "eval_samples_per_second": 50.069,
+      "eval_steps_per_second": 0.394,
+      "step": 377
+    },
+    {
+      "epoch": 1.0079575596816976,
+      "grad_norm": 1.012512560233075,
+      "learning_rate": 5e-06,
+      "loss": 0.614,
+      "step": 380
+    },
+    {
+      "epoch": 1.0344827586206897,
+      "grad_norm": 0.6423484790332888,
+      "learning_rate": 5e-06,
+      "loss": 0.5872,
+      "step": 390
+    },
+    {
+      "epoch": 1.0610079575596818,
+      "grad_norm": 0.6707809448038629,
+      "learning_rate": 5e-06,
+      "loss": 0.5889,
+      "step": 400
+    },
+    {
+      "epoch": 1.0875331564986737,
+      "grad_norm": 0.6650816829620442,
+      "learning_rate": 5e-06,
+      "loss": 0.5807,
+      "step": 410
+    },
+    {
+      "epoch": 1.1140583554376657,
+      "grad_norm": 0.6224384353734153,
+      "learning_rate": 5e-06,
+      "loss": 0.5915,
+      "step": 420
+    },
+    {
+      "epoch": 1.1405835543766578,
+      "grad_norm": 0.5684723509980193,
+      "learning_rate": 5e-06,
+      "loss": 0.5868,
+      "step": 430
+    },
+    {
+      "epoch": 1.16710875331565,
+      "grad_norm": 0.7059253953598508,
+      "learning_rate": 5e-06,
+      "loss": 0.5766,
+      "step": 440
+    },
+    {
+      "epoch": 1.193633952254642,
+      "grad_norm": 0.523842539770435,
+      "learning_rate": 5e-06,
+      "loss": 0.5848,
+      "step": 450
+    },
+    {
+      "epoch": 1.2201591511936338,
+      "grad_norm": 0.6714226112627073,
+      "learning_rate": 5e-06,
+      "loss": 0.5832,
+      "step": 460
+    },
+    {
+      "epoch": 1.246684350132626,
+      "grad_norm": 0.7286525903858807,
+      "learning_rate": 5e-06,
+      "loss": 0.589,
+      "step": 470
+    },
+    {
+      "epoch": 1.273209549071618,
+      "grad_norm": 0.52664344746201,
+      "learning_rate": 5e-06,
+      "loss": 0.5781,
+      "step": 480
+    },
+    {
+      "epoch": 1.29973474801061,
+      "grad_norm": 0.5394135765946012,
+      "learning_rate": 5e-06,
+      "loss": 0.5793,
+      "step": 490
+    },
+    {
+      "epoch": 1.3262599469496021,
+      "grad_norm": 0.5473942505095254,
+      "learning_rate": 5e-06,
+      "loss": 0.5865,
+      "step": 500
+    },
+    {
+      "epoch": 1.3527851458885942,
+      "grad_norm": 0.5239209443855906,
+      "learning_rate": 5e-06,
+      "loss": 0.5851,
+      "step": 510
+    },
+    {
+      "epoch": 1.3793103448275863,
+      "grad_norm": 0.638268567207473,
+      "learning_rate": 5e-06,
+      "loss": 0.5868,
+      "step": 520
+    },
+    {
+      "epoch": 1.4058355437665782,
+      "grad_norm": 0.5569448150591939,
+      "learning_rate": 5e-06,
+      "loss": 0.5851,
+      "step": 530
+    },
+    {
+      "epoch": 1.4323607427055702,
+      "grad_norm": 0.5739975276281734,
+      "learning_rate": 5e-06,
+      "loss": 0.5807,
+      "step": 540
+    },
+    {
+      "epoch": 1.4588859416445623,
+      "grad_norm": 0.5662395827613842,
+      "learning_rate": 5e-06,
+      "loss": 0.5768,
+      "step": 550
+    },
+    {
+      "epoch": 1.4854111405835544,
+      "grad_norm": 0.7163603774804506,
+      "learning_rate": 5e-06,
+      "loss": 0.5843,
+      "step": 560
+    },
+    {
+      "epoch": 1.5119363395225465,
+      "grad_norm": 0.7163733609893056,
+      "learning_rate": 5e-06,
+      "loss": 0.5917,
+      "step": 570
+    },
+    {
+      "epoch": 1.5384615384615383,
+      "grad_norm": 0.6180542754631709,
+      "learning_rate": 5e-06,
+      "loss": 0.5817,
+      "step": 580
+    },
+    {
+      "epoch": 1.5649867374005306,
+      "grad_norm": 0.5638514267037327,
+      "learning_rate": 5e-06,
+      "loss": 0.5862,
+      "step": 590
+    },
+    {
+      "epoch": 1.5915119363395225,
+      "grad_norm": 0.561015542053421,
+      "learning_rate": 5e-06,
+      "loss": 0.5815,
+      "step": 600
+    },
+    {
+      "epoch": 1.6180371352785146,
+      "grad_norm": 0.5019925924846618,
+      "learning_rate": 5e-06,
+      "loss": 0.5854,
+      "step": 610
+    },
+    {
+      "epoch": 1.6445623342175066,
+      "grad_norm": 0.6456145062380878,
+      "learning_rate": 5e-06,
+      "loss": 0.584,
+      "step": 620
+    },
+    {
+      "epoch": 1.6710875331564987,
+      "grad_norm": 0.6273758065445275,
+      "learning_rate": 5e-06,
+      "loss": 0.5794,
+      "step": 630
+    },
+    {
+      "epoch": 1.6976127320954908,
+      "grad_norm": 0.47537875001219887,
+      "learning_rate": 5e-06,
+      "loss": 0.5831,
+      "step": 640
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.5799224686148554,
+      "learning_rate": 5e-06,
+      "loss": 0.5876,
+      "step": 650
+    },
+    {
+      "epoch": 1.750663129973475,
+      "grad_norm": 0.6477709788042249,
+      "learning_rate": 5e-06,
+      "loss": 0.5786,
+      "step": 660
+    },
+    {
+      "epoch": 1.7771883289124668,
+      "grad_norm": 0.5398722777232823,
+      "learning_rate": 5e-06,
+      "loss": 0.5788,
+      "step": 670
+    },
+    {
+      "epoch": 1.8037135278514589,
+      "grad_norm": 0.503790352480121,
+      "learning_rate": 5e-06,
+      "loss": 0.5789,
+      "step": 680
+    },
+    {
+      "epoch": 1.830238726790451,
+      "grad_norm": 0.5980642131822352,
+      "learning_rate": 5e-06,
+      "loss": 0.5836,
+      "step": 690
+    },
+    {
+      "epoch": 1.8567639257294428,
+      "grad_norm": 0.5144731472377694,
+      "learning_rate": 5e-06,
+      "loss": 0.5818,
+      "step": 700
+    },
+    {
+      "epoch": 1.8832891246684351,
+      "grad_norm": 0.748547689970371,
+      "learning_rate": 5e-06,
+      "loss": 0.5882,
+      "step": 710
+    },
+    {
+      "epoch": 1.909814323607427,
+      "grad_norm": 0.5454719348703825,
+      "learning_rate": 5e-06,
+      "loss": 0.5857,
+      "step": 720
+    },
+    {
+      "epoch": 1.936339522546419,
+      "grad_norm": 0.537701148828446,
+      "learning_rate": 5e-06,
+      "loss": 0.5919,
+      "step": 730
+    },
+    {
+      "epoch": 1.9628647214854111,
+      "grad_norm": 0.5333812418899406,
+      "learning_rate": 5e-06,
+      "loss": 0.5804,
+      "step": 740
+    },
+    {
+      "epoch": 1.9893899204244032,
+      "grad_norm": 0.5945192673823688,
+      "learning_rate": 5e-06,
+      "loss": 0.5875,
+      "step": 750
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.6138430833816528,
+      "eval_runtime": 204.1448,
+      "eval_samples_per_second": 49.764,
+      "eval_steps_per_second": 0.392,
+      "step": 754
+    },
+    {
+      "epoch": 2.0159151193633953,
+      "grad_norm": 0.70666737481995,
+      "learning_rate": 5e-06,
+      "loss": 0.5507,
+      "step": 760
+    },
+    {
+      "epoch": 2.042440318302387,
+      "grad_norm": 0.6653710012047342,
+      "learning_rate": 5e-06,
+      "loss": 0.5366,
+      "step": 770
+    },
+    {
+      "epoch": 2.0689655172413794,
+      "grad_norm": 0.6392050869043902,
+      "learning_rate": 5e-06,
+      "loss": 0.5362,
+      "step": 780
+    },
+    {
+      "epoch": 2.0954907161803713,
+      "grad_norm": 0.552417268591106,
+      "learning_rate": 5e-06,
+      "loss": 0.537,
+      "step": 790
+    },
+    {
+      "epoch": 2.1220159151193636,
+      "grad_norm": 0.5237418246816455,
+      "learning_rate": 5e-06,
+      "loss": 0.5387,
+      "step": 800
+    },
+    {
+      "epoch": 2.1485411140583555,
+      "grad_norm": 0.5931401358357342,
+      "learning_rate": 5e-06,
+      "loss": 0.5383,
+      "step": 810
+    },
+    {
+      "epoch": 2.1750663129973473,
+      "grad_norm": 0.5859178511093684,
+      "learning_rate": 5e-06,
+      "loss": 0.5385,
+      "step": 820
+    },
+    {
+      "epoch": 2.2015915119363396,
+      "grad_norm": 0.54174318871251,
+      "learning_rate": 5e-06,
+      "loss": 0.536,
+      "step": 830
+    },
+    {
+      "epoch": 2.2281167108753315,
+      "grad_norm": 0.6719837144625476,
+      "learning_rate": 5e-06,
+      "loss": 0.5443,
+      "step": 840
+    },
+    {
+      "epoch": 2.2546419098143238,
+      "grad_norm": 0.5913384023217347,
+      "learning_rate": 5e-06,
+      "loss": 0.5275,
+      "step": 850
+    },
+    {
+      "epoch": 2.2811671087533156,
+      "grad_norm": 0.5589992603235397,
+      "learning_rate": 5e-06,
+      "loss": 0.5324,
+      "step": 860
+    },
+    {
+      "epoch": 2.3076923076923075,
+      "grad_norm": 0.7009013738100872,
+      "learning_rate": 5e-06,
+      "loss": 0.5358,
+      "step": 870
+    },
+    {
+      "epoch": 2.3342175066313,
+      "grad_norm": 0.5192518963562719,
+      "learning_rate": 5e-06,
+      "loss": 0.5397,
+      "step": 880
+    },
+    {
+      "epoch": 2.3607427055702916,
+      "grad_norm": 0.5673317232053819,
+      "learning_rate": 5e-06,
+      "loss": 0.5441,
+      "step": 890
+    },
+    {
+      "epoch": 2.387267904509284,
+      "grad_norm": 0.6565705800504016,
+      "learning_rate": 5e-06,
+      "loss": 0.5376,
+      "step": 900
+    },
+    {
+      "epoch": 2.413793103448276,
+      "grad_norm": 0.6497220724543952,
+      "learning_rate": 5e-06,
+      "loss": 0.5309,
+      "step": 910
+    },
+    {
+      "epoch": 2.4403183023872677,
+      "grad_norm": 0.5907005582908587,
+      "learning_rate": 5e-06,
+      "loss": 0.535,
+      "step": 920
+    },
+    {
+      "epoch": 2.46684350132626,
+      "grad_norm": 0.6229830599060672,
+      "learning_rate": 5e-06,
+      "loss": 0.5497,
+      "step": 930
+    },
+    {
+      "epoch": 2.493368700265252,
+      "grad_norm": 0.5474753923784902,
+      "learning_rate": 5e-06,
+      "loss": 0.5409,
+      "step": 940
+    },
+    {
+      "epoch": 2.519893899204244,
+      "grad_norm": 0.5472523872985887,
+      "learning_rate": 5e-06,
+      "loss": 0.536,
+      "step": 950
+    },
+    {
+      "epoch": 2.546419098143236,
+      "grad_norm": 0.9882268568658281,
+      "learning_rate": 5e-06,
+      "loss": 0.5375,
+      "step": 960
+    },
+    {
+      "epoch": 2.5729442970822283,
+      "grad_norm": 0.6076358216087694,
+      "learning_rate": 5e-06,
+      "loss": 0.5417,
+      "step": 970
+    },
+    {
+      "epoch": 2.59946949602122,
+      "grad_norm": 0.6233857713542503,
+      "learning_rate": 5e-06,
+      "loss": 0.543,
+      "step": 980
+    },
+    {
+      "epoch": 2.6259946949602124,
+      "grad_norm": 0.662156931341361,
+      "learning_rate": 5e-06,
+      "loss": 0.5437,
+      "step": 990
+    },
+    {
+      "epoch": 2.6525198938992043,
+      "grad_norm": 0.520667113721229,
+      "learning_rate": 5e-06,
+      "loss": 0.5392,
+      "step": 1000
+    },
+    {
+      "epoch": 2.679045092838196,
+      "grad_norm": 0.5971607456495881,
+      "learning_rate": 5e-06,
+      "loss": 0.5357,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7055702917771884,
+      "grad_norm": 0.5939890330376003,
+      "learning_rate": 5e-06,
+      "loss": 0.5371,
+      "step": 1020
+    },
+    {
+      "epoch": 2.7320954907161803,
+      "grad_norm": 0.5773060764723967,
+      "learning_rate": 5e-06,
+      "loss": 0.5359,
+      "step": 1030
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.5827565949490094,
+      "learning_rate": 5e-06,
+      "loss": 0.5431,
+      "step": 1040
+    },
+    {
+      "epoch": 2.7851458885941645,
+      "grad_norm": 0.6314529885890559,
+      "learning_rate": 5e-06,
+      "loss": 0.5444,
+      "step": 1050
+    },
+    {
+      "epoch": 2.8116710875331563,
+      "grad_norm": 0.5422009039427094,
+      "learning_rate": 5e-06,
+      "loss": 0.54,
+      "step": 1060
+    },
+    {
+      "epoch": 2.8381962864721486,
+      "grad_norm": 0.6028981522236778,
+      "learning_rate": 5e-06,
+      "loss": 0.5433,
+      "step": 1070
+    },
+    {
+      "epoch": 2.8647214854111405,
+      "grad_norm": 0.5970004720986751,
+      "learning_rate": 5e-06,
+      "loss": 0.5461,
+      "step": 1080
+    },
+    {
+      "epoch": 2.8912466843501328,
+      "grad_norm": 0.574798524733574,
+      "learning_rate": 5e-06,
+      "loss": 0.5397,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9177718832891246,
+      "grad_norm": 0.5790776127397061,
+      "learning_rate": 5e-06,
+      "loss": 0.5404,
+      "step": 1100
+    },
+    {
+      "epoch": 2.9442970822281165,
+      "grad_norm": 0.5249016599382924,
+      "learning_rate": 5e-06,
+      "loss": 0.5461,
+      "step": 1110
+    },
+    {
+      "epoch": 2.970822281167109,
+      "grad_norm": 0.6578189789009287,
+      "learning_rate": 5e-06,
+      "loss": 0.5387,
+      "step": 1120
+    },
+    {
+      "epoch": 2.9973474801061006,
+      "grad_norm": 0.5226173945421619,
+      "learning_rate": 5e-06,
+      "loss": 0.5391,
+      "step": 1130
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.6164913177490234,
+      "eval_runtime": 204.5208,
+      "eval_samples_per_second": 49.672,
+      "eval_steps_per_second": 0.391,
+      "step": 1131
+    },
+    {
+      "epoch": 3.0,
+      "step": 1131,
+      "total_flos": 1894048365281280.0,
+      "train_loss": 0.5957063028603713,
+      "train_runtime": 34171.7918,
+      "train_samples_per_second": 16.944,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1131,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1894048365281280.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed