End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +948 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3
 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_dataforge_economics
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # OH_DCFT_V3_wo_dataforge_economics
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6410

 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_dataforge_economics
 # OH_DCFT_V3_wo_dataforge_economics
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_dataforge_economics dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6410

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9982238010657194,
+    "eval_loss": 0.6410338282585144,
+    "eval_runtime": 226.5313,
+    "eval_samples_per_second": 50.214,
+    "eval_steps_per_second": 0.393,
+    "total_flos": 2120178393415680.0,
+    "train_loss": 0.6170312019321026,
+    "train_runtime": 37849.9965,
+    "train_samples_per_second": 17.13,
+    "train_steps_per_second": 0.033
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9982238010657194,
+    "eval_loss": 0.6410338282585144,
+    "eval_runtime": 226.5313,
+    "eval_samples_per_second": 50.214,
+    "eval_steps_per_second": 0.393
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9982238010657194,
+    "total_flos": 2120178393415680.0,
+    "train_loss": 0.6170312019321026,
+    "train_runtime": 37849.9965,
+    "train_samples_per_second": 17.13,
+    "train_steps_per_second": 0.033
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,948 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9982238010657194,
+  "eval_steps": 500,
+  "global_step": 1266,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.023682652457075192,
+      "grad_norm": 1.816531801378568,
+      "learning_rate": 5e-06,
+      "loss": 0.9186,
+      "step": 10
+    },
+    {
+      "epoch": 0.047365304914150384,
+      "grad_norm": 1.3574653188634527,
+      "learning_rate": 5e-06,
+      "loss": 0.8003,
+      "step": 20
+    },
+    {
+      "epoch": 0.07104795737122557,
+      "grad_norm": 1.5669400481998332,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 30
+    },
+    {
+      "epoch": 0.09473060982830077,
+      "grad_norm": 1.693017888571637,
+      "learning_rate": 5e-06,
+      "loss": 0.7443,
+      "step": 40
+    },
+    {
+      "epoch": 0.11841326228537596,
+      "grad_norm": 1.4722298325308145,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 50
+    },
+    {
+      "epoch": 0.14209591474245115,
+      "grad_norm": 0.6948406074694712,
+      "learning_rate": 5e-06,
+      "loss": 0.7183,
+      "step": 60
+    },
+    {
+      "epoch": 0.16577856719952636,
+      "grad_norm": 1.0658863949894195,
+      "learning_rate": 5e-06,
+      "loss": 0.7146,
+      "step": 70
+    },
+    {
+      "epoch": 0.18946121965660154,
+      "grad_norm": 0.6741474876420499,
+      "learning_rate": 5e-06,
+      "loss": 0.6984,
+      "step": 80
+    },
+    {
+      "epoch": 0.21314387211367672,
+      "grad_norm": 1.0831273123966574,
+      "learning_rate": 5e-06,
+      "loss": 0.7001,
+      "step": 90
+    },
+    {
+      "epoch": 0.23682652457075193,
+      "grad_norm": 1.6635919094429064,
+      "learning_rate": 5e-06,
+      "loss": 0.6886,
+      "step": 100
+    },
+    {
+      "epoch": 0.2605091770278271,
+      "grad_norm": 0.8636474722702722,
+      "learning_rate": 5e-06,
+      "loss": 0.686,
+      "step": 110
+    },
+    {
+      "epoch": 0.2841918294849023,
+      "grad_norm": 0.7926351502503324,
+      "learning_rate": 5e-06,
+      "loss": 0.6817,
+      "step": 120
+    },
+    {
+      "epoch": 0.30787448194197753,
+      "grad_norm": 0.5901918000373718,
+      "learning_rate": 5e-06,
+      "loss": 0.692,
+      "step": 130
+    },
+    {
+      "epoch": 0.3315571343990527,
+      "grad_norm": 0.6148247600757576,
+      "learning_rate": 5e-06,
+      "loss": 0.6817,
+      "step": 140
+    },
+    {
+      "epoch": 0.3552397868561279,
+      "grad_norm": 0.5101477855796542,
+      "learning_rate": 5e-06,
+      "loss": 0.6853,
+      "step": 150
+    },
+    {
+      "epoch": 0.3789224393132031,
+      "grad_norm": 0.5704569063807804,
+      "learning_rate": 5e-06,
+      "loss": 0.6737,
+      "step": 160
+    },
+    {
+      "epoch": 0.40260509177027826,
+      "grad_norm": 0.6027371888875237,
+      "learning_rate": 5e-06,
+      "loss": 0.6774,
+      "step": 170
+    },
+    {
+      "epoch": 0.42628774422735344,
+      "grad_norm": 0.6683781224223748,
+      "learning_rate": 5e-06,
+      "loss": 0.6688,
+      "step": 180
+    },
+    {
+      "epoch": 0.4499703966844287,
+      "grad_norm": 0.5016914580041181,
+      "learning_rate": 5e-06,
+      "loss": 0.676,
+      "step": 190
+    },
+    {
+      "epoch": 0.47365304914150386,
+      "grad_norm": 0.5452609890724252,
+      "learning_rate": 5e-06,
+      "loss": 0.667,
+      "step": 200
+    },
+    {
+      "epoch": 0.49733570159857904,
+      "grad_norm": 0.6510845954594989,
+      "learning_rate": 5e-06,
+      "loss": 0.6691,
+      "step": 210
+    },
+    {
+      "epoch": 0.5210183540556542,
+      "grad_norm": 0.5161096609174309,
+      "learning_rate": 5e-06,
+      "loss": 0.6666,
+      "step": 220
+    },
+    {
+      "epoch": 0.5447010065127295,
+      "grad_norm": 0.48148888185279093,
+      "learning_rate": 5e-06,
+      "loss": 0.6705,
+      "step": 230
+    },
+    {
+      "epoch": 0.5683836589698046,
+      "grad_norm": 0.646541503129204,
+      "learning_rate": 5e-06,
+      "loss": 0.6615,
+      "step": 240
+    },
+    {
+      "epoch": 0.5920663114268798,
+      "grad_norm": 0.5813160913392431,
+      "learning_rate": 5e-06,
+      "loss": 0.6761,
+      "step": 250
+    },
+    {
+      "epoch": 0.6157489638839551,
+      "grad_norm": 0.6228219376370357,
+      "learning_rate": 5e-06,
+      "loss": 0.6645,
+      "step": 260
+    },
+    {
+      "epoch": 0.6394316163410302,
+      "grad_norm": 0.532227126517761,
+      "learning_rate": 5e-06,
+      "loss": 0.6634,
+      "step": 270
+    },
+    {
+      "epoch": 0.6631142687981054,
+      "grad_norm": 0.5335777471459527,
+      "learning_rate": 5e-06,
+      "loss": 0.6707,
+      "step": 280
+    },
+    {
+      "epoch": 0.6867969212551805,
+      "grad_norm": 0.5859556398404715,
+      "learning_rate": 5e-06,
+      "loss": 0.6595,
+      "step": 290
+    },
+    {
+      "epoch": 0.7104795737122558,
+      "grad_norm": 0.678661754791658,
+      "learning_rate": 5e-06,
+      "loss": 0.6635,
+      "step": 300
+    },
+    {
+      "epoch": 0.7341622261693309,
+      "grad_norm": 0.5182311919791005,
+      "learning_rate": 5e-06,
+      "loss": 0.6546,
+      "step": 310
+    },
+    {
+      "epoch": 0.7578448786264061,
+      "grad_norm": 0.5694216979735989,
+      "learning_rate": 5e-06,
+      "loss": 0.6541,
+      "step": 320
+    },
+    {
+      "epoch": 0.7815275310834814,
+      "grad_norm": 0.5806274782090943,
+      "learning_rate": 5e-06,
+      "loss": 0.6581,
+      "step": 330
+    },
+    {
+      "epoch": 0.8052101835405565,
+      "grad_norm": 0.57308874729709,
+      "learning_rate": 5e-06,
+      "loss": 0.661,
+      "step": 340
+    },
+    {
+      "epoch": 0.8288928359976317,
+      "grad_norm": 0.48566616576972127,
+      "learning_rate": 5e-06,
+      "loss": 0.6514,
+      "step": 350
+    },
+    {
+      "epoch": 0.8525754884547069,
+      "grad_norm": 0.5177674580432304,
+      "learning_rate": 5e-06,
+      "loss": 0.6521,
+      "step": 360
+    },
+    {
+      "epoch": 0.8762581409117821,
+      "grad_norm": 0.7662128776841642,
+      "learning_rate": 5e-06,
+      "loss": 0.6521,
+      "step": 370
+    },
+    {
+      "epoch": 0.8999407933688574,
+      "grad_norm": 0.7194268558213192,
+      "learning_rate": 5e-06,
+      "loss": 0.6527,
+      "step": 380
+    },
+    {
+      "epoch": 0.9236234458259325,
+      "grad_norm": 0.5317577993198188,
+      "learning_rate": 5e-06,
+      "loss": 0.6462,
+      "step": 390
+    },
+    {
+      "epoch": 0.9473060982830077,
+      "grad_norm": 0.6130071094405808,
+      "learning_rate": 5e-06,
+      "loss": 0.6536,
+      "step": 400
+    },
+    {
+      "epoch": 0.9709887507400828,
+      "grad_norm": 0.5054262813488729,
+      "learning_rate": 5e-06,
+      "loss": 0.651,
+      "step": 410
+    },
+    {
+      "epoch": 0.9946714031971581,
+      "grad_norm": 0.5216224953516784,
+      "learning_rate": 5e-06,
+      "loss": 0.6504,
+      "step": 420
+    },
+    {
+      "epoch": 0.9994079336885732,
+      "eval_loss": 0.6479789018630981,
+      "eval_runtime": 225.9605,
+      "eval_samples_per_second": 50.341,
+      "eval_steps_per_second": 0.394,
+      "step": 422
+    },
+    {
+      "epoch": 1.0183540556542332,
+      "grad_norm": 0.6448097818743337,
+      "learning_rate": 5e-06,
+      "loss": 0.6143,
+      "step": 430
+    },
+    {
+      "epoch": 1.0420367081113084,
+      "grad_norm": 0.5557066787595923,
+      "learning_rate": 5e-06,
+      "loss": 0.6112,
+      "step": 440
+    },
+    {
+      "epoch": 1.0657193605683837,
+      "grad_norm": 0.6491926304674187,
+      "learning_rate": 5e-06,
+      "loss": 0.5991,
+      "step": 450
+    },
+    {
+      "epoch": 1.089402013025459,
+      "grad_norm": 0.5604079979316403,
+      "learning_rate": 5e-06,
+      "loss": 0.6058,
+      "step": 460
+    },
+    {
+      "epoch": 1.1130846654825342,
+      "grad_norm": 0.5772544741030392,
+      "learning_rate": 5e-06,
+      "loss": 0.6075,
+      "step": 470
+    },
+    {
+      "epoch": 1.1367673179396092,
+      "grad_norm": 0.4792859571457941,
+      "learning_rate": 5e-06,
+      "loss": 0.6064,
+      "step": 480
+    },
+    {
+      "epoch": 1.1604499703966844,
+      "grad_norm": 0.5655936542523597,
+      "learning_rate": 5e-06,
+      "loss": 0.5994,
+      "step": 490
+    },
+    {
+      "epoch": 1.1841326228537596,
+      "grad_norm": 0.7349705127084367,
+      "learning_rate": 5e-06,
+      "loss": 0.6097,
+      "step": 500
+    },
+    {
+      "epoch": 1.2078152753108349,
+      "grad_norm": 0.5763999652681121,
+      "learning_rate": 5e-06,
+      "loss": 0.6135,
+      "step": 510
+    },
+    {
+      "epoch": 1.2314979277679101,
+      "grad_norm": 0.6420586148204931,
+      "learning_rate": 5e-06,
+      "loss": 0.6132,
+      "step": 520
+    },
+    {
+      "epoch": 1.2551805802249851,
+      "grad_norm": 0.5788998804332586,
+      "learning_rate": 5e-06,
+      "loss": 0.6004,
+      "step": 530
+    },
+    {
+      "epoch": 1.2788632326820604,
+      "grad_norm": 0.536638979204567,
+      "learning_rate": 5e-06,
+      "loss": 0.5964,
+      "step": 540
+    },
+    {
+      "epoch": 1.3025458851391356,
+      "grad_norm": 0.555896270898685,
+      "learning_rate": 5e-06,
+      "loss": 0.6059,
+      "step": 550
+    },
+    {
+      "epoch": 1.3262285375962108,
+      "grad_norm": 0.523881024195525,
+      "learning_rate": 5e-06,
+      "loss": 0.6065,
+      "step": 560
+    },
+    {
+      "epoch": 1.349911190053286,
+      "grad_norm": 0.5750577354690338,
+      "learning_rate": 5e-06,
+      "loss": 0.6056,
+      "step": 570
+    },
+    {
+      "epoch": 1.373593842510361,
+      "grad_norm": 0.5341219344965924,
+      "learning_rate": 5e-06,
+      "loss": 0.6038,
+      "step": 580
+    },
+    {
+      "epoch": 1.3972764949674363,
+      "grad_norm": 0.53243695293744,
+      "learning_rate": 5e-06,
+      "loss": 0.6045,
+      "step": 590
+    },
+    {
+      "epoch": 1.4209591474245116,
+      "grad_norm": 0.5160793462220815,
+      "learning_rate": 5e-06,
+      "loss": 0.6109,
+      "step": 600
+    },
+    {
+      "epoch": 1.4446417998815868,
+      "grad_norm": 0.519786480340386,
+      "learning_rate": 5e-06,
+      "loss": 0.6047,
+      "step": 610
+    },
+    {
+      "epoch": 1.468324452338662,
+      "grad_norm": 0.5176998387507717,
+      "learning_rate": 5e-06,
+      "loss": 0.6025,
+      "step": 620
+    },
+    {
+      "epoch": 1.492007104795737,
+      "grad_norm": 0.5326374678084742,
+      "learning_rate": 5e-06,
+      "loss": 0.6058,
+      "step": 630
+    },
+    {
+      "epoch": 1.5156897572528123,
+      "grad_norm": 0.6814294915516854,
+      "learning_rate": 5e-06,
+      "loss": 0.6018,
+      "step": 640
+    },
+    {
+      "epoch": 1.5393724097098875,
+      "grad_norm": 0.5067893932415504,
+      "learning_rate": 5e-06,
+      "loss": 0.6093,
+      "step": 650
+    },
+    {
+      "epoch": 1.5630550621669625,
+      "grad_norm": 0.6391993779464403,
+      "learning_rate": 5e-06,
+      "loss": 0.6045,
+      "step": 660
+    },
+    {
+      "epoch": 1.586737714624038,
+      "grad_norm": 0.5670108575671168,
+      "learning_rate": 5e-06,
+      "loss": 0.6046,
+      "step": 670
+    },
+    {
+      "epoch": 1.610420367081113,
+      "grad_norm": 0.5200973369062145,
+      "learning_rate": 5e-06,
+      "loss": 0.6083,
+      "step": 680
+    },
+    {
+      "epoch": 1.6341030195381883,
+      "grad_norm": 0.49629609919819867,
+      "learning_rate": 5e-06,
+      "loss": 0.6102,
+      "step": 690
+    },
+    {
+      "epoch": 1.6577856719952635,
+      "grad_norm": 0.5411312975043618,
+      "learning_rate": 5e-06,
+      "loss": 0.6058,
+      "step": 700
+    },
+    {
+      "epoch": 1.6814683244523385,
+      "grad_norm": 0.6511065530477933,
+      "learning_rate": 5e-06,
+      "loss": 0.6061,
+      "step": 710
+    },
+    {
+      "epoch": 1.705150976909414,
+      "grad_norm": 0.5356040304164067,
+      "learning_rate": 5e-06,
+      "loss": 0.608,
+      "step": 720
+    },
+    {
+      "epoch": 1.728833629366489,
+      "grad_norm": 0.5044332661887174,
+      "learning_rate": 5e-06,
+      "loss": 0.6067,
+      "step": 730
+    },
+    {
+      "epoch": 1.7525162818235642,
+      "grad_norm": 0.5679268459481845,
+      "learning_rate": 5e-06,
+      "loss": 0.6044,
+      "step": 740
+    },
+    {
+      "epoch": 1.7761989342806395,
+      "grad_norm": 0.5041966525853055,
+      "learning_rate": 5e-06,
+      "loss": 0.6012,
+      "step": 750
+    },
+    {
+      "epoch": 1.7998815867377145,
+      "grad_norm": 0.622815173397146,
+      "learning_rate": 5e-06,
+      "loss": 0.6064,
+      "step": 760
+    },
+    {
+      "epoch": 1.82356423919479,
+      "grad_norm": 0.6105310434875696,
+      "learning_rate": 5e-06,
+      "loss": 0.5993,
+      "step": 770
+    },
+    {
+      "epoch": 1.847246891651865,
+      "grad_norm": 0.6478637621549127,
+      "learning_rate": 5e-06,
+      "loss": 0.6083,
+      "step": 780
+    },
+    {
+      "epoch": 1.8709295441089402,
+      "grad_norm": 0.5288770221223392,
+      "learning_rate": 5e-06,
+      "loss": 0.6064,
+      "step": 790
+    },
+    {
+      "epoch": 1.8946121965660154,
+      "grad_norm": 0.6310494125165369,
+      "learning_rate": 5e-06,
+      "loss": 0.6054,
+      "step": 800
+    },
+    {
+      "epoch": 1.9182948490230904,
+      "grad_norm": 0.6224434949310766,
+      "learning_rate": 5e-06,
+      "loss": 0.6014,
+      "step": 810
+    },
+    {
+      "epoch": 1.941977501480166,
+      "grad_norm": 0.5690732220009218,
+      "learning_rate": 5e-06,
+      "loss": 0.6026,
+      "step": 820
+    },
+    {
+      "epoch": 1.965660153937241,
+      "grad_norm": 0.5819642741301115,
+      "learning_rate": 5e-06,
+      "loss": 0.6056,
+      "step": 830
+    },
+    {
+      "epoch": 1.9893428063943162,
+      "grad_norm": 0.4867726663758881,
+      "learning_rate": 5e-06,
+      "loss": 0.6039,
+      "step": 840
+    },
+    {
+      "epoch": 1.9988158673771461,
+      "eval_loss": 0.6381043791770935,
+      "eval_runtime": 226.3666,
+      "eval_samples_per_second": 50.25,
+      "eval_steps_per_second": 0.393,
+      "step": 844
+    },
+    {
+      "epoch": 2.0130254588513914,
+      "grad_norm": 0.799137975988829,
+      "learning_rate": 5e-06,
+      "loss": 0.5789,
+      "step": 850
+    },
+    {
+      "epoch": 2.0367081113084664,
+      "grad_norm": 0.5651415058987638,
+      "learning_rate": 5e-06,
+      "loss": 0.5532,
+      "step": 860
+    },
+    {
+      "epoch": 2.060390763765542,
+      "grad_norm": 0.6785997009798781,
+      "learning_rate": 5e-06,
+      "loss": 0.5528,
+      "step": 870
+    },
+    {
+      "epoch": 2.084073416222617,
+      "grad_norm": 0.589434711398729,
+      "learning_rate": 5e-06,
+      "loss": 0.5581,
+      "step": 880
+    },
+    {
+      "epoch": 2.1077560686796923,
+      "grad_norm": 0.6120240073948606,
+      "learning_rate": 5e-06,
+      "loss": 0.5578,
+      "step": 890
+    },
+    {
+      "epoch": 2.1314387211367674,
+      "grad_norm": 0.5356757525158238,
+      "learning_rate": 5e-06,
+      "loss": 0.5564,
+      "step": 900
+    },
+    {
+      "epoch": 2.1551213735938424,
+      "grad_norm": 0.6532444755899165,
+      "learning_rate": 5e-06,
+      "loss": 0.5609,
+      "step": 910
+    },
+    {
+      "epoch": 2.178804026050918,
+      "grad_norm": 0.7762170235905984,
+      "learning_rate": 5e-06,
+      "loss": 0.5588,
+      "step": 920
+    },
+    {
+      "epoch": 2.202486678507993,
+      "grad_norm": 0.5110672320160398,
+      "learning_rate": 5e-06,
+      "loss": 0.5611,
+      "step": 930
+    },
+    {
+      "epoch": 2.2261693309650683,
+      "grad_norm": 0.5175930179811897,
+      "learning_rate": 5e-06,
+      "loss": 0.5592,
+      "step": 940
+    },
+    {
+      "epoch": 2.2498519834221433,
+      "grad_norm": 0.5781036208277457,
+      "learning_rate": 5e-06,
+      "loss": 0.5567,
+      "step": 950
+    },
+    {
+      "epoch": 2.2735346358792183,
+      "grad_norm": 0.638650838885228,
+      "learning_rate": 5e-06,
+      "loss": 0.5566,
+      "step": 960
+    },
+    {
+      "epoch": 2.297217288336294,
+      "grad_norm": 0.5928010322906232,
+      "learning_rate": 5e-06,
+      "loss": 0.5595,
+      "step": 970
+    },
+    {
+      "epoch": 2.320899940793369,
+      "grad_norm": 0.6471191913384154,
+      "learning_rate": 5e-06,
+      "loss": 0.5618,
+      "step": 980
+    },
+    {
+      "epoch": 2.3445825932504443,
+      "grad_norm": 0.5439480452105689,
+      "learning_rate": 5e-06,
+      "loss": 0.5623,
+      "step": 990
+    },
+    {
+      "epoch": 2.3682652457075193,
+      "grad_norm": 0.5616150107415377,
+      "learning_rate": 5e-06,
+      "loss": 0.559,
+      "step": 1000
+    },
+    {
+      "epoch": 2.3919478981645943,
+      "grad_norm": 0.6063862680614267,
+      "learning_rate": 5e-06,
+      "loss": 0.5584,
+      "step": 1010
+    },
+    {
+      "epoch": 2.4156305506216698,
+      "grad_norm": 0.6514652096821626,
+      "learning_rate": 5e-06,
+      "loss": 0.564,
+      "step": 1020
+    },
+    {
+      "epoch": 2.4393132030787448,
+      "grad_norm": 0.5978550068800899,
+      "learning_rate": 5e-06,
+      "loss": 0.5648,
+      "step": 1030
+    },
+    {
+      "epoch": 2.4629958555358202,
+      "grad_norm": 0.5389659551331341,
+      "learning_rate": 5e-06,
+      "loss": 0.557,
+      "step": 1040
+    },
+    {
+      "epoch": 2.4866785079928952,
+      "grad_norm": 0.49728611593057986,
+      "learning_rate": 5e-06,
+      "loss": 0.5563,
+      "step": 1050
+    },
+    {
+      "epoch": 2.5103611604499703,
+      "grad_norm": 0.5602610558768696,
+      "learning_rate": 5e-06,
+      "loss": 0.5628,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5340438129070457,
+      "grad_norm": 0.6603901062892502,
+      "learning_rate": 5e-06,
+      "loss": 0.5576,
+      "step": 1070
+    },
+    {
+      "epoch": 2.5577264653641207,
+      "grad_norm": 0.6506359439942624,
+      "learning_rate": 5e-06,
+      "loss": 0.5567,
+      "step": 1080
+    },
+    {
+      "epoch": 2.581409117821196,
+      "grad_norm": 0.6081261618456771,
+      "learning_rate": 5e-06,
+      "loss": 0.5622,
+      "step": 1090
+    },
+    {
+      "epoch": 2.605091770278271,
+      "grad_norm": 0.5583840013876924,
+      "learning_rate": 5e-06,
+      "loss": 0.5641,
+      "step": 1100
+    },
+    {
+      "epoch": 2.6287744227353462,
+      "grad_norm": 0.677285075920703,
+      "learning_rate": 5e-06,
+      "loss": 0.5676,
+      "step": 1110
+    },
+    {
+      "epoch": 2.6524570751924217,
+      "grad_norm": 0.5283556684105596,
+      "learning_rate": 5e-06,
+      "loss": 0.5609,
+      "step": 1120
+    },
+    {
+      "epoch": 2.6761397276494967,
+      "grad_norm": 0.5373459018769577,
+      "learning_rate": 5e-06,
+      "loss": 0.5619,
+      "step": 1130
+    },
+    {
+      "epoch": 2.699822380106572,
+      "grad_norm": 0.5544732747898201,
+      "learning_rate": 5e-06,
+      "loss": 0.5577,
+      "step": 1140
+    },
+    {
+      "epoch": 2.723505032563647,
+      "grad_norm": 0.5842636221616775,
+      "learning_rate": 5e-06,
+      "loss": 0.5644,
+      "step": 1150
+    },
+    {
+      "epoch": 2.747187685020722,
+      "grad_norm": 0.5195933914819354,
+      "learning_rate": 5e-06,
+      "loss": 0.5669,
+      "step": 1160
+    },
+    {
+      "epoch": 2.7708703374777977,
+      "grad_norm": 0.6191768570542974,
+      "learning_rate": 5e-06,
+      "loss": 0.5647,
+      "step": 1170
+    },
+    {
+      "epoch": 2.7945529899348727,
+      "grad_norm": 0.5155223116029976,
+      "learning_rate": 5e-06,
+      "loss": 0.5681,
+      "step": 1180
+    },
+    {
+      "epoch": 2.818235642391948,
+      "grad_norm": 0.6371568852312499,
+      "learning_rate": 5e-06,
+      "loss": 0.5604,
+      "step": 1190
+    },
+    {
+      "epoch": 2.841918294849023,
+      "grad_norm": 0.6170324836320265,
+      "learning_rate": 5e-06,
+      "loss": 0.5634,
+      "step": 1200
+    },
+    {
+      "epoch": 2.865600947306098,
+      "grad_norm": 0.5196345014766655,
+      "learning_rate": 5e-06,
+      "loss": 0.5585,
+      "step": 1210
+    },
+    {
+      "epoch": 2.8892835997631736,
+      "grad_norm": 0.5582532842634907,
+      "learning_rate": 5e-06,
+      "loss": 0.5681,
+      "step": 1220
+    },
+    {
+      "epoch": 2.9129662522202486,
+      "grad_norm": 0.5370930409847251,
+      "learning_rate": 5e-06,
+      "loss": 0.5622,
+      "step": 1230
+    },
+    {
+      "epoch": 2.936648904677324,
+      "grad_norm": 0.5553114047738701,
+      "learning_rate": 5e-06,
+      "loss": 0.5698,
+      "step": 1240
+    },
+    {
+      "epoch": 2.960331557134399,
+      "grad_norm": 0.5109477407586265,
+      "learning_rate": 5e-06,
+      "loss": 0.5651,
+      "step": 1250
+    },
+    {
+      "epoch": 2.984014209591474,
+      "grad_norm": 0.535258137027712,
+      "learning_rate": 5e-06,
+      "loss": 0.5552,
+      "step": 1260
+    },
+    {
+      "epoch": 2.9982238010657194,
+      "eval_loss": 0.6410338282585144,
+      "eval_runtime": 226.7918,
+      "eval_samples_per_second": 50.156,
+      "eval_steps_per_second": 0.392,
+      "step": 1266
+    },
+    {
+      "epoch": 2.9982238010657194,
+      "step": 1266,
+      "total_flos": 2120178393415680.0,
+      "train_loss": 0.6170312019321026,
+      "train_runtime": 37849.9965,
+      "train_samples_per_second": 17.13,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1266,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2120178393415680.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed