dabrown commited on
Commit
5cc9a28
·
verified ·
1 Parent(s): caa0ef6

Training in progress, step 825, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f7d50ca95c733455c27a0191698218c0fb7257f1f04a523e26b9cb95be94420
3
  size 80792096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78d6a93ae40c764a93d8bf10457115738f1676a06e0b7c4f70ba8ed147965951
3
  size 80792096
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a3a0295315f0ad2885ad03f8acc2f49097e143fa65061bf55855900307c3d3c
3
  size 41460084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:279d695a5dae9627f3e5f5746ada16638db4ebf932e04b1a84afe8716367c20f
3
  size 41460084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5d154d045ee189af4c648f80535098cfde6139351de9c4d32c890f904602cee
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ed957e35db52d753e90b2e89a572c6b011e6c971890858f14f12a4305efd1f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:789cac547d76905ddc88036180d9f246f307a104c94da93e131a174052f790e8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:797085a729e10588f0af8dfcea7980f4fc8438c6de826417968959a62c5bdc9a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5006257822277848,
5
  "eval_steps": 275,
6
- "global_step": 550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3881,6 +3881,1939 @@
3881
  "eval_samples_per_second": 9.076,
3882
  "eval_steps_per_second": 4.541,
3883
  "step": 550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3884
  }
3885
  ],
3886
  "logging_steps": 1,
@@ -3900,7 +5833,7 @@
3900
  "attributes": {}
3901
  }
3902
  },
3903
- "total_flos": 1.0074178982447677e+18,
3904
  "train_batch_size": 2,
3905
  "trial_name": null,
3906
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7509386733416771,
5
  "eval_steps": 275,
6
+ "global_step": 825,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3881
  "eval_samples_per_second": 9.076,
3882
  "eval_steps_per_second": 4.541,
3883
  "step": 550
3884
+ },
3885
+ {
3886
+ "epoch": 0.5015360109227444,
3887
+ "grad_norm": 0.44648221135139465,
3888
+ "learning_rate": 0.00010100967745256766,
3889
+ "loss": 2.4608,
3890
+ "step": 551
3891
+ },
3892
+ {
3893
+ "epoch": 0.502446239617704,
3894
+ "grad_norm": 0.4537757635116577,
3895
+ "learning_rate": 0.00010072120418249745,
3896
+ "loss": 2.3225,
3897
+ "step": 552
3898
+ },
3899
+ {
3900
+ "epoch": 0.5033564683126636,
3901
+ "grad_norm": 0.45517390966415405,
3902
+ "learning_rate": 0.00010043272491034523,
3903
+ "loss": 2.4937,
3904
+ "step": 553
3905
+ },
3906
+ {
3907
+ "epoch": 0.5042666970076232,
3908
+ "grad_norm": 0.4309350848197937,
3909
+ "learning_rate": 0.00010014424203692388,
3910
+ "loss": 2.3752,
3911
+ "step": 554
3912
+ },
3913
+ {
3914
+ "epoch": 0.5051769257025828,
3915
+ "grad_norm": 0.4162474274635315,
3916
+ "learning_rate": 9.985575796307615e-05,
3917
+ "loss": 2.356,
3918
+ "step": 555
3919
+ },
3920
+ {
3921
+ "epoch": 0.5060871543975424,
3922
+ "grad_norm": 0.454484224319458,
3923
+ "learning_rate": 9.956727508965481e-05,
3924
+ "loss": 2.3114,
3925
+ "step": 556
3926
+ },
3927
+ {
3928
+ "epoch": 0.506997383092502,
3929
+ "grad_norm": 0.4174659550189972,
3930
+ "learning_rate": 9.927879581750259e-05,
3931
+ "loss": 2.2907,
3932
+ "step": 557
3933
+ },
3934
+ {
3935
+ "epoch": 0.5079076117874616,
3936
+ "grad_norm": 0.42208272218704224,
3937
+ "learning_rate": 9.899032254743235e-05,
3938
+ "loss": 2.3062,
3939
+ "step": 558
3940
+ },
3941
+ {
3942
+ "epoch": 0.5088178404824212,
3943
+ "grad_norm": 0.41622260212898254,
3944
+ "learning_rate": 9.870185768020693e-05,
3945
+ "loss": 2.3293,
3946
+ "step": 559
3947
+ },
3948
+ {
3949
+ "epoch": 0.5097280691773808,
3950
+ "grad_norm": 0.45536091923713684,
3951
+ "learning_rate": 9.84134036165192e-05,
3952
+ "loss": 2.2693,
3953
+ "step": 560
3954
+ },
3955
+ {
3956
+ "epoch": 0.5106382978723404,
3957
+ "grad_norm": 0.3866654634475708,
3958
+ "learning_rate": 9.812496275697226e-05,
3959
+ "loss": 2.1254,
3960
+ "step": 561
3961
+ },
3962
+ {
3963
+ "epoch": 0.5115485265673,
3964
+ "grad_norm": 0.41163724660873413,
3965
+ "learning_rate": 9.783653750205915e-05,
3966
+ "loss": 2.2173,
3967
+ "step": 562
3968
+ },
3969
+ {
3970
+ "epoch": 0.5124587552622596,
3971
+ "grad_norm": 0.4118368327617645,
3972
+ "learning_rate": 9.754813025214317e-05,
3973
+ "loss": 2.2477,
3974
+ "step": 563
3975
+ },
3976
+ {
3977
+ "epoch": 0.5133689839572193,
3978
+ "grad_norm": 0.4303027093410492,
3979
+ "learning_rate": 9.725974340743769e-05,
3980
+ "loss": 2.3855,
3981
+ "step": 564
3982
+ },
3983
+ {
3984
+ "epoch": 0.5142792126521789,
3985
+ "grad_norm": 0.41484424471855164,
3986
+ "learning_rate": 9.697137936798634e-05,
3987
+ "loss": 2.289,
3988
+ "step": 565
3989
+ },
3990
+ {
3991
+ "epoch": 0.5151894413471385,
3992
+ "grad_norm": 0.397540807723999,
3993
+ "learning_rate": 9.668304053364294e-05,
3994
+ "loss": 2.1869,
3995
+ "step": 566
3996
+ },
3997
+ {
3998
+ "epoch": 0.5160996700420981,
3999
+ "grad_norm": 0.38194504380226135,
4000
+ "learning_rate": 9.639472930405143e-05,
4001
+ "loss": 2.1475,
4002
+ "step": 567
4003
+ },
4004
+ {
4005
+ "epoch": 0.5170098987370577,
4006
+ "grad_norm": 0.42346352338790894,
4007
+ "learning_rate": 9.610644807862625e-05,
4008
+ "loss": 2.1869,
4009
+ "step": 568
4010
+ },
4011
+ {
4012
+ "epoch": 0.5179201274320173,
4013
+ "grad_norm": 0.4076716899871826,
4014
+ "learning_rate": 9.581819925653188e-05,
4015
+ "loss": 2.1979,
4016
+ "step": 569
4017
+ },
4018
+ {
4019
+ "epoch": 0.5188303561269769,
4020
+ "grad_norm": 0.3888402581214905,
4021
+ "learning_rate": 9.552998523666326e-05,
4022
+ "loss": 2.131,
4023
+ "step": 570
4024
+ },
4025
+ {
4026
+ "epoch": 0.5197405848219365,
4027
+ "grad_norm": 0.3931845426559448,
4028
+ "learning_rate": 9.524180841762577e-05,
4029
+ "loss": 2.07,
4030
+ "step": 571
4031
+ },
4032
+ {
4033
+ "epoch": 0.5206508135168961,
4034
+ "grad_norm": 0.38014844059944153,
4035
+ "learning_rate": 9.495367119771503e-05,
4036
+ "loss": 1.8905,
4037
+ "step": 572
4038
+ },
4039
+ {
4040
+ "epoch": 0.5215610422118557,
4041
+ "grad_norm": 0.4095986783504486,
4042
+ "learning_rate": 9.46655759748972e-05,
4043
+ "loss": 2.1217,
4044
+ "step": 573
4045
+ },
4046
+ {
4047
+ "epoch": 0.5224712709068153,
4048
+ "grad_norm": 0.3965059518814087,
4049
+ "learning_rate": 9.437752514678887e-05,
4050
+ "loss": 2.0446,
4051
+ "step": 574
4052
+ },
4053
+ {
4054
+ "epoch": 0.5233814996017749,
4055
+ "grad_norm": 0.42244595289230347,
4056
+ "learning_rate": 9.408952111063727e-05,
4057
+ "loss": 2.1184,
4058
+ "step": 575
4059
+ },
4060
+ {
4061
+ "epoch": 0.5242917282967345,
4062
+ "grad_norm": 0.4098374843597412,
4063
+ "learning_rate": 9.380156626330009e-05,
4064
+ "loss": 2.0365,
4065
+ "step": 576
4066
+ },
4067
+ {
4068
+ "epoch": 0.5252019569916941,
4069
+ "grad_norm": 0.424630731344223,
4070
+ "learning_rate": 9.35136630012257e-05,
4071
+ "loss": 2.1321,
4072
+ "step": 577
4073
+ },
4074
+ {
4075
+ "epoch": 0.5261121856866537,
4076
+ "grad_norm": 0.431257039308548,
4077
+ "learning_rate": 9.322581372043321e-05,
4078
+ "loss": 2.1883,
4079
+ "step": 578
4080
+ },
4081
+ {
4082
+ "epoch": 0.5270224143816133,
4083
+ "grad_norm": 0.40972310304641724,
4084
+ "learning_rate": 9.293802081649243e-05,
4085
+ "loss": 2.0498,
4086
+ "step": 579
4087
+ },
4088
+ {
4089
+ "epoch": 0.527932643076573,
4090
+ "grad_norm": 0.4238174855709076,
4091
+ "learning_rate": 9.265028668450402e-05,
4092
+ "loss": 2.0831,
4093
+ "step": 580
4094
+ },
4095
+ {
4096
+ "epoch": 0.5288428717715326,
4097
+ "grad_norm": 0.43082720041275024,
4098
+ "learning_rate": 9.23626137190794e-05,
4099
+ "loss": 2.1704,
4100
+ "step": 581
4101
+ },
4102
+ {
4103
+ "epoch": 0.5297531004664922,
4104
+ "grad_norm": 0.4657386541366577,
4105
+ "learning_rate": 9.207500431432115e-05,
4106
+ "loss": 2.1338,
4107
+ "step": 582
4108
+ },
4109
+ {
4110
+ "epoch": 0.5306633291614519,
4111
+ "grad_norm": 0.45546218752861023,
4112
+ "learning_rate": 9.178746086380275e-05,
4113
+ "loss": 2.1469,
4114
+ "step": 583
4115
+ },
4116
+ {
4117
+ "epoch": 0.5315735578564115,
4118
+ "grad_norm": 0.4571657180786133,
4119
+ "learning_rate": 9.149998576054874e-05,
4120
+ "loss": 2.2036,
4121
+ "step": 584
4122
+ },
4123
+ {
4124
+ "epoch": 0.5324837865513711,
4125
+ "grad_norm": 0.4736308753490448,
4126
+ "learning_rate": 9.121258139701502e-05,
4127
+ "loss": 2.2127,
4128
+ "step": 585
4129
+ },
4130
+ {
4131
+ "epoch": 0.5333940152463307,
4132
+ "grad_norm": 0.46164044737815857,
4133
+ "learning_rate": 9.092525016506858e-05,
4134
+ "loss": 2.1177,
4135
+ "step": 586
4136
+ },
4137
+ {
4138
+ "epoch": 0.5343042439412903,
4139
+ "grad_norm": 0.48902738094329834,
4140
+ "learning_rate": 9.063799445596795e-05,
4141
+ "loss": 2.2188,
4142
+ "step": 587
4143
+ },
4144
+ {
4145
+ "epoch": 0.5352144726362499,
4146
+ "grad_norm": 0.4848826825618744,
4147
+ "learning_rate": 9.035081666034304e-05,
4148
+ "loss": 2.2372,
4149
+ "step": 588
4150
+ },
4151
+ {
4152
+ "epoch": 0.5361247013312095,
4153
+ "grad_norm": 0.49447596073150635,
4154
+ "learning_rate": 9.006371916817534e-05,
4155
+ "loss": 2.2786,
4156
+ "step": 589
4157
+ },
4158
+ {
4159
+ "epoch": 0.5370349300261691,
4160
+ "grad_norm": 0.45803365111351013,
4161
+ "learning_rate": 8.977670436877811e-05,
4162
+ "loss": 2.0856,
4163
+ "step": 590
4164
+ },
4165
+ {
4166
+ "epoch": 0.5379451587211287,
4167
+ "grad_norm": 0.5051137208938599,
4168
+ "learning_rate": 8.948977465077632e-05,
4169
+ "loss": 2.2219,
4170
+ "step": 591
4171
+ },
4172
+ {
4173
+ "epoch": 0.5388553874160883,
4174
+ "grad_norm": 0.5000495910644531,
4175
+ "learning_rate": 8.920293240208694e-05,
4176
+ "loss": 2.2151,
4177
+ "step": 592
4178
+ },
4179
+ {
4180
+ "epoch": 0.5397656161110479,
4181
+ "grad_norm": 0.5208660960197449,
4182
+ "learning_rate": 8.891618000989891e-05,
4183
+ "loss": 2.3381,
4184
+ "step": 593
4185
+ },
4186
+ {
4187
+ "epoch": 0.5406758448060075,
4188
+ "grad_norm": 0.5478562116622925,
4189
+ "learning_rate": 8.862951986065345e-05,
4190
+ "loss": 2.1592,
4191
+ "step": 594
4192
+ },
4193
+ {
4194
+ "epoch": 0.5415860735009671,
4195
+ "grad_norm": 0.5808135271072388,
4196
+ "learning_rate": 8.83429543400241e-05,
4197
+ "loss": 2.3678,
4198
+ "step": 595
4199
+ },
4200
+ {
4201
+ "epoch": 0.5424963021959267,
4202
+ "grad_norm": 0.5837730765342712,
4203
+ "learning_rate": 8.805648583289674e-05,
4204
+ "loss": 2.3397,
4205
+ "step": 596
4206
+ },
4207
+ {
4208
+ "epoch": 0.5434065308908863,
4209
+ "grad_norm": 0.6297701597213745,
4210
+ "learning_rate": 8.777011672335008e-05,
4211
+ "loss": 2.4747,
4212
+ "step": 597
4213
+ },
4214
+ {
4215
+ "epoch": 0.5443167595858459,
4216
+ "grad_norm": 0.735091507434845,
4217
+ "learning_rate": 8.748384939463543e-05,
4218
+ "loss": 2.5623,
4219
+ "step": 598
4220
+ },
4221
+ {
4222
+ "epoch": 0.5452269882808055,
4223
+ "grad_norm": 0.8794751167297363,
4224
+ "learning_rate": 8.719768622915714e-05,
4225
+ "loss": 2.557,
4226
+ "step": 599
4227
+ },
4228
+ {
4229
+ "epoch": 0.5461372169757651,
4230
+ "grad_norm": 1.7257400751113892,
4231
+ "learning_rate": 8.691162960845264e-05,
4232
+ "loss": 2.7309,
4233
+ "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.5470474456707247,
4237
+ "grad_norm": 0.42087942361831665,
4238
+ "learning_rate": 8.662568191317273e-05,
4239
+ "loss": 2.3698,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.5479576743656844,
4244
+ "grad_norm": 0.45860013365745544,
4245
+ "learning_rate": 8.633984552306164e-05,
4246
+ "loss": 2.4207,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.548867903060644,
4251
+ "grad_norm": 0.4520268738269806,
4252
+ "learning_rate": 8.605412281693727e-05,
4253
+ "loss": 2.5066,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.5497781317556036,
4258
+ "grad_norm": 0.42296549677848816,
4259
+ "learning_rate": 8.57685161726715e-05,
4260
+ "loss": 2.3086,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.5506883604505632,
4265
+ "grad_norm": 0.4488202631473541,
4266
+ "learning_rate": 8.548302796717019e-05,
4267
+ "loss": 2.3949,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.5515985891455228,
4272
+ "grad_norm": 0.43144237995147705,
4273
+ "learning_rate": 8.519766057635355e-05,
4274
+ "loss": 2.3862,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.5525088178404824,
4279
+ "grad_norm": 0.43359822034835815,
4280
+ "learning_rate": 8.491241637513644e-05,
4281
+ "loss": 2.2576,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.553419046535442,
4286
+ "grad_norm": 0.41321370005607605,
4287
+ "learning_rate": 8.462729773740832e-05,
4288
+ "loss": 2.294,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.5543292752304017,
4293
+ "grad_norm": 0.4104655981063843,
4294
+ "learning_rate": 8.434230703601384e-05,
4295
+ "loss": 2.2005,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.5552395039253613,
4300
+ "grad_norm": 0.4300507605075836,
4301
+ "learning_rate": 8.405744664273278e-05,
4302
+ "loss": 2.4242,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.5561497326203209,
4307
+ "grad_norm": 0.42208558320999146,
4308
+ "learning_rate": 8.37727189282606e-05,
4309
+ "loss": 2.2788,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.5570599613152805,
4314
+ "grad_norm": 0.4338865280151367,
4315
+ "learning_rate": 8.34881262621884e-05,
4316
+ "loss": 2.4823,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.5579701900102401,
4321
+ "grad_norm": 0.399996280670166,
4322
+ "learning_rate": 8.320367101298351e-05,
4323
+ "loss": 2.1704,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.5588804187051997,
4328
+ "grad_norm": 0.4285057783126831,
4329
+ "learning_rate": 8.291935554796962e-05,
4330
+ "loss": 2.3405,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.5597906474001593,
4335
+ "grad_norm": 0.41062313318252563,
4336
+ "learning_rate": 8.263518223330697e-05,
4337
+ "loss": 2.2424,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.5607008760951189,
4342
+ "grad_norm": 0.40377363562583923,
4343
+ "learning_rate": 8.235115343397295e-05,
4344
+ "loss": 2.2611,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.5616111047900785,
4349
+ "grad_norm": 0.38683268427848816,
4350
+ "learning_rate": 8.206727151374207e-05,
4351
+ "loss": 2.0895,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.5625213334850381,
4356
+ "grad_norm": 0.3813983201980591,
4357
+ "learning_rate": 8.178353883516664e-05,
4358
+ "loss": 2.0735,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.5634315621799977,
4363
+ "grad_norm": 0.40514618158340454,
4364
+ "learning_rate": 8.149995775955686e-05,
4365
+ "loss": 2.2261,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.5643417908749573,
4370
+ "grad_norm": 0.3858005106449127,
4371
+ "learning_rate": 8.121653064696118e-05,
4372
+ "loss": 2.0794,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.565252019569917,
4377
+ "grad_norm": 0.4032715857028961,
4378
+ "learning_rate": 8.093325985614685e-05,
4379
+ "loss": 2.2016,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.5661622482648766,
4384
+ "grad_norm": 0.40262308716773987,
4385
+ "learning_rate": 8.065014774458003e-05,
4386
+ "loss": 2.1193,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.5670724769598362,
4391
+ "grad_norm": 0.4016035497188568,
4392
+ "learning_rate": 8.036719666840647e-05,
4393
+ "loss": 2.0265,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.5679827056547958,
4398
+ "grad_norm": 0.42149612307548523,
4399
+ "learning_rate": 8.008440898243149e-05,
4400
+ "loss": 2.1186,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.5688929343497554,
4405
+ "grad_norm": 0.40055879950523376,
4406
+ "learning_rate": 7.980178704010089e-05,
4407
+ "loss": 2.0066,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.569803163044715,
4412
+ "grad_norm": 0.40722012519836426,
4413
+ "learning_rate": 7.951933319348095e-05,
4414
+ "loss": 2.0262,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.5707133917396746,
4419
+ "grad_norm": 0.4256587624549866,
4420
+ "learning_rate": 7.923704979323899e-05,
4421
+ "loss": 2.0765,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.5716236204346342,
4426
+ "grad_norm": 0.452120840549469,
4427
+ "learning_rate": 7.895493918862396e-05,
4428
+ "loss": 2.2261,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.5725338491295938,
4433
+ "grad_norm": 0.45264679193496704,
4434
+ "learning_rate": 7.867300372744657e-05,
4435
+ "loss": 2.1382,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.5734440778245534,
4440
+ "grad_norm": 0.4367516040802002,
4441
+ "learning_rate": 7.839124575606004e-05,
4442
+ "loss": 2.1152,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.574354306519513,
4447
+ "grad_norm": 0.41711944341659546,
4448
+ "learning_rate": 7.810966761934053e-05,
4449
+ "loss": 2.0499,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.5752645352144726,
4454
+ "grad_norm": 0.4274366497993469,
4455
+ "learning_rate": 7.782827166066739e-05,
4456
+ "loss": 2.0843,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.5761747639094322,
4461
+ "grad_norm": 0.4412054717540741,
4462
+ "learning_rate": 7.754706022190398e-05,
4463
+ "loss": 2.1291,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.5770849926043918,
4468
+ "grad_norm": 0.43048036098480225,
4469
+ "learning_rate": 7.726603564337791e-05,
4470
+ "loss": 2.0492,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.5779952212993514,
4475
+ "grad_norm": 0.4570690393447876,
4476
+ "learning_rate": 7.69852002638618e-05,
4477
+ "loss": 2.2868,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.578905449994311,
4482
+ "grad_norm": 0.4373176097869873,
4483
+ "learning_rate": 7.670455642055361e-05,
4484
+ "loss": 2.106,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.5798156786892706,
4489
+ "grad_norm": 0.46953141689300537,
4490
+ "learning_rate": 7.642410644905726e-05,
4491
+ "loss": 2.2209,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.5807259073842302,
4496
+ "grad_norm": 0.48714450001716614,
4497
+ "learning_rate": 7.614385268336336e-05,
4498
+ "loss": 2.2923,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.5816361360791898,
4503
+ "grad_norm": 0.45921769738197327,
4504
+ "learning_rate": 7.586379745582944e-05,
4505
+ "loss": 2.1636,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.5825463647741494,
4510
+ "grad_norm": 0.4652685821056366,
4511
+ "learning_rate": 7.558394309716088e-05,
4512
+ "loss": 2.205,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.5834565934691092,
4517
+ "grad_norm": 0.48991554975509644,
4518
+ "learning_rate": 7.530429193639128e-05,
4519
+ "loss": 2.1805,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.5843668221640688,
4524
+ "grad_norm": 0.515925407409668,
4525
+ "learning_rate": 7.502484630086318e-05,
4526
+ "loss": 2.2075,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.5852770508590284,
4531
+ "grad_norm": 0.497593492269516,
4532
+ "learning_rate": 7.474560851620873e-05,
4533
+ "loss": 2.0497,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.586187279553988,
4538
+ "grad_norm": 0.5590106248855591,
4539
+ "learning_rate": 7.446658090633026e-05,
4540
+ "loss": 2.273,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.5870975082489476,
4545
+ "grad_norm": 0.570801854133606,
4546
+ "learning_rate": 7.41877657933809e-05,
4547
+ "loss": 2.4023,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.5880077369439072,
4552
+ "grad_norm": 0.5791385173797607,
4553
+ "learning_rate": 7.390916549774536e-05,
4554
+ "loss": 2.2391,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.5889179656388668,
4559
+ "grad_norm": 0.677937924861908,
4560
+ "learning_rate": 7.363078233802063e-05,
4561
+ "loss": 2.6502,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.5898281943338264,
4566
+ "grad_norm": 0.6995664238929749,
4567
+ "learning_rate": 7.335261863099651e-05,
4568
+ "loss": 2.4716,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.590738423028786,
4573
+ "grad_norm": 0.8106637597084045,
4574
+ "learning_rate": 7.307467669163655e-05,
4575
+ "loss": 2.3574,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.5916486517237456,
4580
+ "grad_norm": 1.577392816543579,
4581
+ "learning_rate": 7.279695883305866e-05,
4582
+ "loss": 2.1657,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.5925588804187052,
4587
+ "grad_norm": 0.43565502762794495,
4588
+ "learning_rate": 7.251946736651582e-05,
4589
+ "loss": 2.4673,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.5934691091136648,
4594
+ "grad_norm": 0.42318448424339294,
4595
+ "learning_rate": 7.224220460137701e-05,
4596
+ "loss": 2.409,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.5943793378086244,
4601
+ "grad_norm": 0.41891592741012573,
4602
+ "learning_rate": 7.196517284510773e-05,
4603
+ "loss": 2.3842,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.595289566503584,
4608
+ "grad_norm": 0.4418095350265503,
4609
+ "learning_rate": 7.168837440325114e-05,
4610
+ "loss": 2.3998,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.5961997951985436,
4615
+ "grad_norm": 0.43118715286254883,
4616
+ "learning_rate": 7.141181157940859e-05,
4617
+ "loss": 2.3845,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.5971100238935032,
4622
+ "grad_norm": 0.41435372829437256,
4623
+ "learning_rate": 7.11354866752205e-05,
4624
+ "loss": 2.3077,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.5980202525884628,
4629
+ "grad_norm": 0.4133334457874298,
4630
+ "learning_rate": 7.085940199034735e-05,
4631
+ "loss": 2.385,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.5989304812834224,
4636
+ "grad_norm": 0.4240586757659912,
4637
+ "learning_rate": 7.058355982245037e-05,
4638
+ "loss": 2.3835,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.599840709978382,
4643
+ "grad_norm": 0.4071354568004608,
4644
+ "learning_rate": 7.030796246717255e-05,
4645
+ "loss": 2.0752,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.6007509386733417,
4650
+ "grad_norm": 0.44435110688209534,
4651
+ "learning_rate": 7.003261221811934e-05,
4652
+ "loss": 2.4811,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.6016611673683013,
4657
+ "grad_norm": 0.4175739884376526,
4658
+ "learning_rate": 6.97575113668399e-05,
4659
+ "loss": 2.3717,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.6025713960632609,
4664
+ "grad_norm": 0.41801995038986206,
4665
+ "learning_rate": 6.948266220280771e-05,
4666
+ "loss": 2.3406,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.6034816247582205,
4671
+ "grad_norm": 0.42795541882514954,
4672
+ "learning_rate": 6.920806701340155e-05,
4673
+ "loss": 2.1457,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.6043918534531801,
4678
+ "grad_norm": 0.39928749203681946,
4679
+ "learning_rate": 6.893372808388675e-05,
4680
+ "loss": 2.2435,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.6053020821481397,
4685
+ "grad_norm": 0.39942193031311035,
4686
+ "learning_rate": 6.865964769739575e-05,
4687
+ "loss": 2.1791,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.6062123108430993,
4692
+ "grad_norm": 0.3968772888183594,
4693
+ "learning_rate": 6.838582813490947e-05,
4694
+ "loss": 2.106,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.607122539538059,
4699
+ "grad_norm": 0.41822007298469543,
4700
+ "learning_rate": 6.811227167523815e-05,
4701
+ "loss": 2.2538,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.6080327682330186,
4706
+ "grad_norm": 0.39328843355178833,
4707
+ "learning_rate": 6.783898059500233e-05,
4708
+ "loss": 2.1372,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.6089429969279782,
4713
+ "grad_norm": 0.39368972182273865,
4714
+ "learning_rate": 6.756595716861407e-05,
4715
+ "loss": 2.1002,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.6098532256229378,
4720
+ "grad_norm": 0.41033080220222473,
4721
+ "learning_rate": 6.729320366825784e-05,
4722
+ "loss": 2.0963,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.6107634543178974,
4727
+ "grad_norm": 0.40454182028770447,
4728
+ "learning_rate": 6.702072236387182e-05,
4729
+ "loss": 2.0902,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.611673683012857,
4734
+ "grad_norm": 0.40117865800857544,
4735
+ "learning_rate": 6.674851552312878e-05,
4736
+ "loss": 2.086,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.6125839117078166,
4741
+ "grad_norm": 0.39771145582199097,
4742
+ "learning_rate": 6.647658541141735e-05,
4743
+ "loss": 1.977,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.6134941404027762,
4748
+ "grad_norm": 0.4230089485645294,
4749
+ "learning_rate": 6.620493429182323e-05,
4750
+ "loss": 2.1687,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.6144043690977358,
4755
+ "grad_norm": 0.40441179275512695,
4756
+ "learning_rate": 6.593356442511015e-05,
4757
+ "loss": 2.1608,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.6153145977926954,
4762
+ "grad_norm": 0.40813207626342773,
4763
+ "learning_rate": 6.566247806970119e-05,
4764
+ "loss": 2.0103,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.616224826487655,
4769
+ "grad_norm": 0.4276919364929199,
4770
+ "learning_rate": 6.539167748165994e-05,
4771
+ "loss": 2.0024,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.6171350551826146,
4776
+ "grad_norm": 0.4275762140750885,
4777
+ "learning_rate": 6.512116491467185e-05,
4778
+ "loss": 2.1589,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.6180452838775743,
4783
+ "grad_norm": 0.4258089065551758,
4784
+ "learning_rate": 6.485094262002529e-05,
4785
+ "loss": 1.9598,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.6189555125725339,
4790
+ "grad_norm": 0.436363160610199,
4791
+ "learning_rate": 6.458101284659286e-05,
4792
+ "loss": 2.2201,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.6198657412674935,
4797
+ "grad_norm": 0.41596871614456177,
4798
+ "learning_rate": 6.431137784081282e-05,
4799
+ "loss": 2.0394,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.6207759699624531,
4804
+ "grad_norm": 0.46983107924461365,
4805
+ "learning_rate": 6.404203984667019e-05,
4806
+ "loss": 2.0277,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.6216861986574127,
4811
+ "grad_norm": 0.4560098350048065,
4812
+ "learning_rate": 6.377300110567821e-05,
4813
+ "loss": 2.2369,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.6225964273523723,
4818
+ "grad_norm": 0.45281484723091125,
4819
+ "learning_rate": 6.350426385685957e-05,
4820
+ "loss": 2.2576,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.6235066560473319,
4825
+ "grad_norm": 0.45989713072776794,
4826
+ "learning_rate": 6.323583033672799e-05,
4827
+ "loss": 2.1294,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.6244168847422915,
4832
+ "grad_norm": 0.46797510981559753,
4833
+ "learning_rate": 6.296770277926937e-05,
4834
+ "loss": 2.0688,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.6253271134372511,
4839
+ "grad_norm": 0.5156514644622803,
4840
+ "learning_rate": 6.269988341592328e-05,
4841
+ "loss": 2.1114,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.6262373421322107,
4846
+ "grad_norm": 0.48389503359794617,
4847
+ "learning_rate": 6.243237447556449e-05,
4848
+ "loss": 2.0931,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.6271475708271703,
4853
+ "grad_norm": 0.4714515209197998,
4854
+ "learning_rate": 6.216517818448423e-05,
4855
+ "loss": 2.0982,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.6280577995221299,
4860
+ "grad_norm": 0.5029696226119995,
4861
+ "learning_rate": 6.189829676637182e-05,
4862
+ "loss": 2.2145,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.6289680282170895,
4867
+ "grad_norm": 0.49942919611930847,
4868
+ "learning_rate": 6.163173244229619e-05,
4869
+ "loss": 2.1344,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.6298782569120491,
4874
+ "grad_norm": 0.5015403032302856,
4875
+ "learning_rate": 6.136548743068713e-05,
4876
+ "loss": 2.0721,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.6307884856070087,
4881
+ "grad_norm": 0.5125733613967896,
4882
+ "learning_rate": 6.109956394731722e-05,
4883
+ "loss": 2.0609,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.6316987143019683,
4888
+ "grad_norm": 0.5657601952552795,
4889
+ "learning_rate": 6.083396420528298e-05,
4890
+ "loss": 2.4455,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.6326089429969279,
4895
+ "grad_norm": 0.5578758120536804,
4896
+ "learning_rate": 6.056869041498687e-05,
4897
+ "loss": 2.205,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.6335191716918875,
4902
+ "grad_norm": 0.6066897511482239,
4903
+ "learning_rate": 6.030374478411847e-05,
4904
+ "loss": 2.3107,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.6344294003868471,
4909
+ "grad_norm": 0.6329779028892517,
4910
+ "learning_rate": 6.0039129517636435e-05,
4911
+ "loss": 2.3413,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.6353396290818069,
4916
+ "grad_norm": 0.7217928767204285,
4917
+ "learning_rate": 5.9774846817750105e-05,
4918
+ "loss": 2.4855,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.6362498577767665,
4923
+ "grad_norm": 0.9335882067680359,
4924
+ "learning_rate": 5.951089888390087e-05,
4925
+ "loss": 2.7768,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.6371600864717261,
4930
+ "grad_norm": 1.5800278186798096,
4931
+ "learning_rate": 5.924728791274432e-05,
4932
+ "loss": 2.5227,
4933
+ "step": 700
4934
+ },
4935
+ {
4936
+ "epoch": 0.6380703151666857,
4937
+ "grad_norm": 0.4945278763771057,
4938
+ "learning_rate": 5.89840160981316e-05,
4939
+ "loss": 2.4396,
4940
+ "step": 701
4941
+ },
4942
+ {
4943
+ "epoch": 0.6389805438616453,
4944
+ "grad_norm": 0.4547106921672821,
4945
+ "learning_rate": 5.872108563109131e-05,
4946
+ "loss": 2.3647,
4947
+ "step": 702
4948
+ },
4949
+ {
4950
+ "epoch": 0.6398907725566049,
4951
+ "grad_norm": 0.46132609248161316,
4952
+ "learning_rate": 5.845849869981137e-05,
4953
+ "loss": 2.5182,
4954
+ "step": 703
4955
+ },
4956
+ {
4957
+ "epoch": 0.6408010012515645,
4958
+ "grad_norm": 0.41916942596435547,
4959
+ "learning_rate": 5.819625748962049e-05,
4960
+ "loss": 2.3084,
4961
+ "step": 704
4962
+ },
4963
+ {
4964
+ "epoch": 0.6417112299465241,
4965
+ "grad_norm": 0.4376005530357361,
4966
+ "learning_rate": 5.79343641829704e-05,
4967
+ "loss": 2.3951,
4968
+ "step": 705
4969
+ },
4970
+ {
4971
+ "epoch": 0.6426214586414837,
4972
+ "grad_norm": 0.4435478448867798,
4973
+ "learning_rate": 5.7672820959417254e-05,
4974
+ "loss": 2.3179,
4975
+ "step": 706
4976
+ },
4977
+ {
4978
+ "epoch": 0.6435316873364433,
4979
+ "grad_norm": 0.43887507915496826,
4980
+ "learning_rate": 5.741162999560386e-05,
4981
+ "loss": 2.2442,
4982
+ "step": 707
4983
+ },
4984
+ {
4985
+ "epoch": 0.6444419160314029,
4986
+ "grad_norm": 0.45083487033843994,
4987
+ "learning_rate": 5.7150793465241346e-05,
4988
+ "loss": 2.3397,
4989
+ "step": 708
4990
+ },
4991
+ {
4992
+ "epoch": 0.6453521447263625,
4993
+ "grad_norm": 0.4538663625717163,
4994
+ "learning_rate": 5.68903135390912e-05,
4995
+ "loss": 2.1915,
4996
+ "step": 709
4997
+ },
4998
+ {
4999
+ "epoch": 0.6462623734213221,
5000
+ "grad_norm": 0.41724029183387756,
5001
+ "learning_rate": 5.663019238494704e-05,
5002
+ "loss": 2.314,
5003
+ "step": 710
5004
+ },
5005
+ {
5006
+ "epoch": 0.6471726021162817,
5007
+ "grad_norm": 0.4224849343299866,
5008
+ "learning_rate": 5.637043216761678e-05,
5009
+ "loss": 2.1712,
5010
+ "step": 711
5011
+ },
5012
+ {
5013
+ "epoch": 0.6480828308112413,
5014
+ "grad_norm": 0.4206700921058655,
5015
+ "learning_rate": 5.611103504890444e-05,
5016
+ "loss": 2.2096,
5017
+ "step": 712
5018
+ },
5019
+ {
5020
+ "epoch": 0.6489930595062009,
5021
+ "grad_norm": 0.41635435819625854,
5022
+ "learning_rate": 5.5852003187592226e-05,
5023
+ "loss": 2.3813,
5024
+ "step": 713
5025
+ },
5026
+ {
5027
+ "epoch": 0.6499032882011605,
5028
+ "grad_norm": 0.4309711456298828,
5029
+ "learning_rate": 5.559333873942259e-05,
5030
+ "loss": 2.3199,
5031
+ "step": 714
5032
+ },
5033
+ {
5034
+ "epoch": 0.6508135168961201,
5035
+ "grad_norm": 0.4129570424556732,
5036
+ "learning_rate": 5.533504385708024e-05,
5037
+ "loss": 2.2384,
5038
+ "step": 715
5039
+ },
5040
+ {
5041
+ "epoch": 0.6517237455910797,
5042
+ "grad_norm": 0.40963393449783325,
5043
+ "learning_rate": 5.5077120690174246e-05,
5044
+ "loss": 2.1162,
5045
+ "step": 716
5046
+ },
5047
+ {
5048
+ "epoch": 0.6526339742860394,
5049
+ "grad_norm": 0.4129562973976135,
5050
+ "learning_rate": 5.481957138522018e-05,
5051
+ "loss": 2.2232,
5052
+ "step": 717
5053
+ },
5054
+ {
5055
+ "epoch": 0.653544202980999,
5056
+ "grad_norm": 0.43267586827278137,
5057
+ "learning_rate": 5.456239808562209e-05,
5058
+ "loss": 2.208,
5059
+ "step": 718
5060
+ },
5061
+ {
5062
+ "epoch": 0.6544544316759586,
5063
+ "grad_norm": 0.4190649688243866,
5064
+ "learning_rate": 5.4305602931655045e-05,
5065
+ "loss": 2.0585,
5066
+ "step": 719
5067
+ },
5068
+ {
5069
+ "epoch": 0.6553646603709182,
5070
+ "grad_norm": 0.41037124395370483,
5071
+ "learning_rate": 5.404918806044679e-05,
5072
+ "loss": 2.1371,
5073
+ "step": 720
5074
+ },
5075
+ {
5076
+ "epoch": 0.6562748890658778,
5077
+ "grad_norm": 0.43521812558174133,
5078
+ "learning_rate": 5.379315560596038e-05,
5079
+ "loss": 2.2227,
5080
+ "step": 721
5081
+ },
5082
+ {
5083
+ "epoch": 0.6571851177608374,
5084
+ "grad_norm": 0.4168831408023834,
5085
+ "learning_rate": 5.3537507698976365e-05,
5086
+ "loss": 1.962,
5087
+ "step": 722
5088
+ },
5089
+ {
5090
+ "epoch": 0.658095346455797,
5091
+ "grad_norm": 0.4054042100906372,
5092
+ "learning_rate": 5.328224646707479e-05,
5093
+ "loss": 2.0154,
5094
+ "step": 723
5095
+ },
5096
+ {
5097
+ "epoch": 0.6590055751507566,
5098
+ "grad_norm": 0.4229868948459625,
5099
+ "learning_rate": 5.3027374034617785e-05,
5100
+ "loss": 2.1075,
5101
+ "step": 724
5102
+ },
5103
+ {
5104
+ "epoch": 0.6599158038457162,
5105
+ "grad_norm": 0.4285801351070404,
5106
+ "learning_rate": 5.277289252273174e-05,
5107
+ "loss": 2.0953,
5108
+ "step": 725
5109
+ },
5110
+ {
5111
+ "epoch": 0.6608260325406758,
5112
+ "grad_norm": 0.43290793895721436,
5113
+ "learning_rate": 5.251880404928971e-05,
5114
+ "loss": 2.3198,
5115
+ "step": 726
5116
+ },
5117
+ {
5118
+ "epoch": 0.6617362612356354,
5119
+ "grad_norm": 0.4112420082092285,
5120
+ "learning_rate": 5.226511072889371e-05,
5121
+ "loss": 2.1214,
5122
+ "step": 727
5123
+ },
5124
+ {
5125
+ "epoch": 0.662646489930595,
5126
+ "grad_norm": 0.4102923572063446,
5127
+ "learning_rate": 5.201181467285723e-05,
5128
+ "loss": 1.8327,
5129
+ "step": 728
5130
+ },
5131
+ {
5132
+ "epoch": 0.6635567186255547,
5133
+ "grad_norm": 0.45263952016830444,
5134
+ "learning_rate": 5.175891798918757e-05,
5135
+ "loss": 2.1415,
5136
+ "step": 729
5137
+ },
5138
+ {
5139
+ "epoch": 0.6644669473205143,
5140
+ "grad_norm": 0.4748065769672394,
5141
+ "learning_rate": 5.1506422782568345e-05,
5142
+ "loss": 2.0534,
5143
+ "step": 730
5144
+ },
5145
+ {
5146
+ "epoch": 0.6653771760154739,
5147
+ "grad_norm": 0.42939677834510803,
5148
+ "learning_rate": 5.125433115434197e-05,
5149
+ "loss": 1.8938,
5150
+ "step": 731
5151
+ },
5152
+ {
5153
+ "epoch": 0.6662874047104335,
5154
+ "grad_norm": 0.4550830125808716,
5155
+ "learning_rate": 5.100264520249205e-05,
5156
+ "loss": 2.1615,
5157
+ "step": 732
5158
+ },
5159
+ {
5160
+ "epoch": 0.6671976334053931,
5161
+ "grad_norm": 0.47507891058921814,
5162
+ "learning_rate": 5.0751367021626215e-05,
5163
+ "loss": 2.1034,
5164
+ "step": 733
5165
+ },
5166
+ {
5167
+ "epoch": 0.6681078621003527,
5168
+ "grad_norm": 0.44998088479042053,
5169
+ "learning_rate": 5.050049870295841e-05,
5170
+ "loss": 1.9552,
5171
+ "step": 734
5172
+ },
5173
+ {
5174
+ "epoch": 0.6690180907953123,
5175
+ "grad_norm": 0.4823133647441864,
5176
+ "learning_rate": 5.025004233429145e-05,
5177
+ "loss": 2.0965,
5178
+ "step": 735
5179
+ },
5180
+ {
5181
+ "epoch": 0.669928319490272,
5182
+ "grad_norm": 0.4999094605445862,
5183
+ "learning_rate": 5.000000000000002e-05,
5184
+ "loss": 2.2037,
5185
+ "step": 736
5186
+ },
5187
+ {
5188
+ "epoch": 0.6708385481852316,
5189
+ "grad_norm": 0.4802190661430359,
5190
+ "learning_rate": 4.9750373781012885e-05,
5191
+ "loss": 2.0832,
5192
+ "step": 737
5193
+ },
5194
+ {
5195
+ "epoch": 0.6717487768801912,
5196
+ "grad_norm": 0.4969741702079773,
5197
+ "learning_rate": 4.950116575479586e-05,
5198
+ "loss": 2.0196,
5199
+ "step": 738
5200
+ },
5201
+ {
5202
+ "epoch": 0.6726590055751508,
5203
+ "grad_norm": 0.47610992193222046,
5204
+ "learning_rate": 4.9252377995334444e-05,
5205
+ "loss": 1.9983,
5206
+ "step": 739
5207
+ },
5208
+ {
5209
+ "epoch": 0.6735692342701104,
5210
+ "grad_norm": 0.5233409404754639,
5211
+ "learning_rate": 4.90040125731165e-05,
5212
+ "loss": 2.2088,
5213
+ "step": 740
5214
+ },
5215
+ {
5216
+ "epoch": 0.67447946296507,
5217
+ "grad_norm": 0.5126591920852661,
5218
+ "learning_rate": 4.87560715551151e-05,
5219
+ "loss": 2.0659,
5220
+ "step": 741
5221
+ },
5222
+ {
5223
+ "epoch": 0.6753896916600296,
5224
+ "grad_norm": 0.5209243893623352,
5225
+ "learning_rate": 4.85085570047713e-05,
5226
+ "loss": 2.2167,
5227
+ "step": 742
5228
+ },
5229
+ {
5230
+ "epoch": 0.6762999203549892,
5231
+ "grad_norm": 0.5382196307182312,
5232
+ "learning_rate": 4.826147098197691e-05,
5233
+ "loss": 2.234,
5234
+ "step": 743
5235
+ },
5236
+ {
5237
+ "epoch": 0.6772101490499488,
5238
+ "grad_norm": 0.5741109848022461,
5239
+ "learning_rate": 4.8014815543057475e-05,
5240
+ "loss": 2.2118,
5241
+ "step": 744
5242
+ },
5243
+ {
5244
+ "epoch": 0.6781203777449084,
5245
+ "grad_norm": 0.577425479888916,
5246
+ "learning_rate": 4.776859274075506e-05,
5247
+ "loss": 2.2446,
5248
+ "step": 745
5249
+ },
5250
+ {
5251
+ "epoch": 0.679030606439868,
5252
+ "grad_norm": 0.614007294178009,
5253
+ "learning_rate": 4.752280462421117e-05,
5254
+ "loss": 2.3033,
5255
+ "step": 746
5256
+ },
5257
+ {
5258
+ "epoch": 0.6799408351348276,
5259
+ "grad_norm": 0.654388427734375,
5260
+ "learning_rate": 4.727745323894976e-05,
5261
+ "loss": 2.33,
5262
+ "step": 747
5263
+ },
5264
+ {
5265
+ "epoch": 0.6808510638297872,
5266
+ "grad_norm": 0.7294265627861023,
5267
+ "learning_rate": 4.703254062686017e-05,
5268
+ "loss": 2.5732,
5269
+ "step": 748
5270
+ },
5271
+ {
5272
+ "epoch": 0.6817612925247468,
5273
+ "grad_norm": 0.8145880699157715,
5274
+ "learning_rate": 4.678806882618003e-05,
5275
+ "loss": 2.4708,
5276
+ "step": 749
5277
+ },
5278
+ {
5279
+ "epoch": 0.6826715212197064,
5280
+ "grad_norm": 1.3215488195419312,
5281
+ "learning_rate": 4.654403987147865e-05,
5282
+ "loss": 2.566,
5283
+ "step": 750
5284
+ },
5285
+ {
5286
+ "epoch": 0.683581749914666,
5287
+ "grad_norm": 0.4625120759010315,
5288
+ "learning_rate": 4.630045579363957e-05,
5289
+ "loss": 2.4207,
5290
+ "step": 751
5291
+ },
5292
+ {
5293
+ "epoch": 0.6844919786096256,
5294
+ "grad_norm": 0.4124356508255005,
5295
+ "learning_rate": 4.605731861984401e-05,
5296
+ "loss": 2.2606,
5297
+ "step": 752
5298
+ },
5299
+ {
5300
+ "epoch": 0.6854022073045852,
5301
+ "grad_norm": 0.42320266366004944,
5302
+ "learning_rate": 4.5814630373554115e-05,
5303
+ "loss": 2.3107,
5304
+ "step": 753
5305
+ },
5306
+ {
5307
+ "epoch": 0.6863124359995448,
5308
+ "grad_norm": 0.4337674379348755,
5309
+ "learning_rate": 4.557239307449561e-05,
5310
+ "loss": 2.5028,
5311
+ "step": 754
5312
+ },
5313
+ {
5314
+ "epoch": 0.6872226646945045,
5315
+ "grad_norm": 0.44273191690444946,
5316
+ "learning_rate": 4.5330608738641486e-05,
5317
+ "loss": 2.42,
5318
+ "step": 755
5319
+ },
5320
+ {
5321
+ "epoch": 0.6881328933894642,
5322
+ "grad_norm": 0.40541112422943115,
5323
+ "learning_rate": 4.508927937819499e-05,
5324
+ "loss": 2.1912,
5325
+ "step": 756
5326
+ },
5327
+ {
5328
+ "epoch": 0.6890431220844238,
5329
+ "grad_norm": 0.4108361601829529,
5330
+ "learning_rate": 4.484840700157295e-05,
5331
+ "loss": 2.2871,
5332
+ "step": 757
5333
+ },
5334
+ {
5335
+ "epoch": 0.6899533507793834,
5336
+ "grad_norm": 0.4105437994003296,
5337
+ "learning_rate": 4.4607993613388976e-05,
5338
+ "loss": 2.244,
5339
+ "step": 758
5340
+ },
5341
+ {
5342
+ "epoch": 0.690863579474343,
5343
+ "grad_norm": 0.42231062054634094,
5344
+ "learning_rate": 4.436804121443689e-05,
5345
+ "loss": 2.3462,
5346
+ "step": 759
5347
+ },
5348
+ {
5349
+ "epoch": 0.6917738081693026,
5350
+ "grad_norm": 0.42183414101600647,
5351
+ "learning_rate": 4.412855180167406e-05,
5352
+ "loss": 2.3269,
5353
+ "step": 760
5354
+ },
5355
+ {
5356
+ "epoch": 0.6926840368642622,
5357
+ "grad_norm": 0.4242246448993683,
5358
+ "learning_rate": 4.388952736820453e-05,
5359
+ "loss": 2.2572,
5360
+ "step": 761
5361
+ },
5362
+ {
5363
+ "epoch": 0.6935942655592218,
5364
+ "grad_norm": 0.41937458515167236,
5365
+ "learning_rate": 4.365096990326297e-05,
5366
+ "loss": 2.1373,
5367
+ "step": 762
5368
+ },
5369
+ {
5370
+ "epoch": 0.6945044942541814,
5371
+ "grad_norm": 0.4179951250553131,
5372
+ "learning_rate": 4.3412881392197526e-05,
5373
+ "loss": 2.2584,
5374
+ "step": 763
5375
+ },
5376
+ {
5377
+ "epoch": 0.695414722949141,
5378
+ "grad_norm": 0.4121822416782379,
5379
+ "learning_rate": 4.317526381645363e-05,
5380
+ "loss": 2.2389,
5381
+ "step": 764
5382
+ },
5383
+ {
5384
+ "epoch": 0.6963249516441006,
5385
+ "grad_norm": 0.400831401348114,
5386
+ "learning_rate": 4.293811915355761e-05,
5387
+ "loss": 2.1734,
5388
+ "step": 765
5389
+ },
5390
+ {
5391
+ "epoch": 0.6972351803390602,
5392
+ "grad_norm": 0.426580011844635,
5393
+ "learning_rate": 4.270144937709981e-05,
5394
+ "loss": 2.1537,
5395
+ "step": 766
5396
+ },
5397
+ {
5398
+ "epoch": 0.6981454090340198,
5399
+ "grad_norm": 0.4221407175064087,
5400
+ "learning_rate": 4.2465256456718615e-05,
5401
+ "loss": 2.1184,
5402
+ "step": 767
5403
+ },
5404
+ {
5405
+ "epoch": 0.6990556377289794,
5406
+ "grad_norm": 0.3915541172027588,
5407
+ "learning_rate": 4.222954235808378e-05,
5408
+ "loss": 2.0476,
5409
+ "step": 768
5410
+ },
5411
+ {
5412
+ "epoch": 0.699965866423939,
5413
+ "grad_norm": 0.39140835404396057,
5414
+ "learning_rate": 4.19943090428802e-05,
5415
+ "loss": 1.9244,
5416
+ "step": 769
5417
+ },
5418
+ {
5419
+ "epoch": 0.7008760951188986,
5420
+ "grad_norm": 0.4446852505207062,
5421
+ "learning_rate": 4.175955846879151e-05,
5422
+ "loss": 2.1622,
5423
+ "step": 770
5424
+ },
5425
+ {
5426
+ "epoch": 0.7017863238138582,
5427
+ "grad_norm": 0.4065980911254883,
5428
+ "learning_rate": 4.1525292589483843e-05,
5429
+ "loss": 1.9522,
5430
+ "step": 771
5431
+ },
5432
+ {
5433
+ "epoch": 0.7026965525088178,
5434
+ "grad_norm": 0.41551730036735535,
5435
+ "learning_rate": 4.129151335458957e-05,
5436
+ "loss": 1.9784,
5437
+ "step": 772
5438
+ },
5439
+ {
5440
+ "epoch": 0.7036067812037774,
5441
+ "grad_norm": 0.4159603416919708,
5442
+ "learning_rate": 4.105822270969102e-05,
5443
+ "loss": 2.0386,
5444
+ "step": 773
5445
+ },
5446
+ {
5447
+ "epoch": 0.7045170098987371,
5448
+ "grad_norm": 0.4357737600803375,
5449
+ "learning_rate": 4.0825422596304396e-05,
5450
+ "loss": 2.1812,
5451
+ "step": 774
5452
+ },
5453
+ {
5454
+ "epoch": 0.7054272385936967,
5455
+ "grad_norm": 0.43295785784721375,
5456
+ "learning_rate": 4.059311495186338e-05,
5457
+ "loss": 2.1118,
5458
+ "step": 775
5459
+ },
5460
+ {
5461
+ "epoch": 0.7063374672886563,
5462
+ "grad_norm": 0.4281920790672302,
5463
+ "learning_rate": 4.036130170970341e-05,
5464
+ "loss": 2.1594,
5465
+ "step": 776
5466
+ },
5467
+ {
5468
+ "epoch": 0.7072476959836159,
5469
+ "grad_norm": 0.45081713795661926,
5470
+ "learning_rate": 4.012998479904525e-05,
5471
+ "loss": 2.2,
5472
+ "step": 777
5473
+ },
5474
+ {
5475
+ "epoch": 0.7081579246785755,
5476
+ "grad_norm": 0.4478646218776703,
5477
+ "learning_rate": 3.9899166144978904e-05,
5478
+ "loss": 2.1344,
5479
+ "step": 778
5480
+ },
5481
+ {
5482
+ "epoch": 0.7090681533735351,
5483
+ "grad_norm": 0.44155117869377136,
5484
+ "learning_rate": 3.966884766844803e-05,
5485
+ "loss": 2.1462,
5486
+ "step": 779
5487
+ },
5488
+ {
5489
+ "epoch": 0.7099783820684947,
5490
+ "grad_norm": 0.4430200755596161,
5491
+ "learning_rate": 3.943903128623335e-05,
5492
+ "loss": 2.0047,
5493
+ "step": 780
5494
+ },
5495
+ {
5496
+ "epoch": 0.7108886107634543,
5497
+ "grad_norm": 0.43685182929039,
5498
+ "learning_rate": 3.920971891093718e-05,
5499
+ "loss": 1.9843,
5500
+ "step": 781
5501
+ },
5502
+ {
5503
+ "epoch": 0.7117988394584139,
5504
+ "grad_norm": 0.44161438941955566,
5505
+ "learning_rate": 3.8980912450967366e-05,
5506
+ "loss": 2.02,
5507
+ "step": 782
5508
+ },
5509
+ {
5510
+ "epoch": 0.7127090681533735,
5511
+ "grad_norm": 0.45051446557044983,
5512
+ "learning_rate": 3.875261381052121e-05,
5513
+ "loss": 2.0351,
5514
+ "step": 783
5515
+ },
5516
+ {
5517
+ "epoch": 0.7136192968483331,
5518
+ "grad_norm": 0.47110506892204285,
5519
+ "learning_rate": 3.852482488956992e-05,
5520
+ "loss": 2.0375,
5521
+ "step": 784
5522
+ },
5523
+ {
5524
+ "epoch": 0.7145295255432927,
5525
+ "grad_norm": 0.49640730023384094,
5526
+ "learning_rate": 3.829754758384262e-05,
5527
+ "loss": 2.301,
5528
+ "step": 785
5529
+ },
5530
+ {
5531
+ "epoch": 0.7154397542382523,
5532
+ "grad_norm": 0.5000788569450378,
5533
+ "learning_rate": 3.807078378481059e-05,
5534
+ "loss": 2.3439,
5535
+ "step": 786
5536
+ },
5537
+ {
5538
+ "epoch": 0.716349982933212,
5539
+ "grad_norm": 0.4984182119369507,
5540
+ "learning_rate": 3.784453537967161e-05,
5541
+ "loss": 2.1625,
5542
+ "step": 787
5543
+ },
5544
+ {
5545
+ "epoch": 0.7172602116281716,
5546
+ "grad_norm": 0.4947352409362793,
5547
+ "learning_rate": 3.761880425133413e-05,
5548
+ "loss": 2.1349,
5549
+ "step": 788
5550
+ },
5551
+ {
5552
+ "epoch": 0.7181704403231312,
5553
+ "grad_norm": 0.48531001806259155,
5554
+ "learning_rate": 3.7393592278401704e-05,
5555
+ "loss": 2.0905,
5556
+ "step": 789
5557
+ },
5558
+ {
5559
+ "epoch": 0.7190806690180908,
5560
+ "grad_norm": 0.5409421324729919,
5561
+ "learning_rate": 3.7168901335157315e-05,
5562
+ "loss": 2.4218,
5563
+ "step": 790
5564
+ },
5565
+ {
5566
+ "epoch": 0.7199908977130504,
5567
+ "grad_norm": 0.5206452012062073,
5568
+ "learning_rate": 3.694473329154778e-05,
5569
+ "loss": 1.98,
5570
+ "step": 791
5571
+ },
5572
+ {
5573
+ "epoch": 0.72090112640801,
5574
+ "grad_norm": 0.5477956533432007,
5575
+ "learning_rate": 3.672109001316809e-05,
5576
+ "loss": 2.4692,
5577
+ "step": 792
5578
+ },
5579
+ {
5580
+ "epoch": 0.7218113551029697,
5581
+ "grad_norm": 0.5402776002883911,
5582
+ "learning_rate": 3.649797336124615e-05,
5583
+ "loss": 2.0121,
5584
+ "step": 793
5585
+ },
5586
+ {
5587
+ "epoch": 0.7227215837979293,
5588
+ "grad_norm": 0.529184877872467,
5589
+ "learning_rate": 3.6275385192627056e-05,
5590
+ "loss": 2.1043,
5591
+ "step": 794
5592
+ },
5593
+ {
5594
+ "epoch": 0.7236318124928889,
5595
+ "grad_norm": 0.5657643675804138,
5596
+ "learning_rate": 3.6053327359757535e-05,
5597
+ "loss": 2.1002,
5598
+ "step": 795
5599
+ },
5600
+ {
5601
+ "epoch": 0.7245420411878485,
5602
+ "grad_norm": 0.6207425594329834,
5603
+ "learning_rate": 3.583180171067101e-05,
5604
+ "loss": 2.3302,
5605
+ "step": 796
5606
+ },
5607
+ {
5608
+ "epoch": 0.7254522698828081,
5609
+ "grad_norm": 0.6927235722541809,
5610
+ "learning_rate": 3.5610810088971625e-05,
5611
+ "loss": 2.446,
5612
+ "step": 797
5613
+ },
5614
+ {
5615
+ "epoch": 0.7263624985777677,
5616
+ "grad_norm": 0.7733549475669861,
5617
+ "learning_rate": 3.5390354333819344e-05,
5618
+ "loss": 2.6638,
5619
+ "step": 798
5620
+ },
5621
+ {
5622
+ "epoch": 0.7272727272727273,
5623
+ "grad_norm": 0.9281876087188721,
5624
+ "learning_rate": 3.517043627991441e-05,
5625
+ "loss": 2.5248,
5626
+ "step": 799
5627
+ },
5628
+ {
5629
+ "epoch": 0.7281829559676869,
5630
+ "grad_norm": 1.9464212656021118,
5631
+ "learning_rate": 3.4951057757482205e-05,
5632
+ "loss": 2.603,
5633
+ "step": 800
5634
+ },
5635
+ {
5636
+ "epoch": 0.7290931846626465,
5637
+ "grad_norm": 0.42626360058784485,
5638
+ "learning_rate": 3.4732220592257946e-05,
5639
+ "loss": 2.5094,
5640
+ "step": 801
5641
+ },
5642
+ {
5643
+ "epoch": 0.7300034133576061,
5644
+ "grad_norm": 0.42765355110168457,
5645
+ "learning_rate": 3.45139266054715e-05,
5646
+ "loss": 2.172,
5647
+ "step": 802
5648
+ },
5649
+ {
5650
+ "epoch": 0.7309136420525657,
5651
+ "grad_norm": 0.42991819977760315,
5652
+ "learning_rate": 3.429617761383222e-05,
5653
+ "loss": 2.2523,
5654
+ "step": 803
5655
+ },
5656
+ {
5657
+ "epoch": 0.7318238707475253,
5658
+ "grad_norm": 0.4252484142780304,
5659
+ "learning_rate": 3.40789754295139e-05,
5660
+ "loss": 2.4049,
5661
+ "step": 804
5662
+ },
5663
+ {
5664
+ "epoch": 0.7327340994424849,
5665
+ "grad_norm": 0.4209078252315521,
5666
+ "learning_rate": 3.3862321860139576e-05,
5667
+ "loss": 2.4249,
5668
+ "step": 805
5669
+ },
5670
+ {
5671
+ "epoch": 0.7336443281374445,
5672
+ "grad_norm": 0.4354207515716553,
5673
+ "learning_rate": 3.364621870876659e-05,
5674
+ "loss": 2.4072,
5675
+ "step": 806
5676
+ },
5677
+ {
5678
+ "epoch": 0.7345545568324041,
5679
+ "grad_norm": 0.4264775514602661,
5680
+ "learning_rate": 3.343066777387148e-05,
5681
+ "loss": 2.3709,
5682
+ "step": 807
5683
+ },
5684
+ {
5685
+ "epoch": 0.7354647855273637,
5686
+ "grad_norm": 0.41826266050338745,
5687
+ "learning_rate": 3.3215670849335155e-05,
5688
+ "loss": 2.2593,
5689
+ "step": 808
5690
+ },
5691
+ {
5692
+ "epoch": 0.7363750142223233,
5693
+ "grad_norm": 0.4268350899219513,
5694
+ "learning_rate": 3.300122972442773e-05,
5695
+ "loss": 2.3383,
5696
+ "step": 809
5697
+ },
5698
+ {
5699
+ "epoch": 0.7372852429172829,
5700
+ "grad_norm": 0.40746137499809265,
5701
+ "learning_rate": 3.278734618379402e-05,
5702
+ "loss": 2.1898,
5703
+ "step": 810
5704
+ },
5705
+ {
5706
+ "epoch": 0.7381954716122425,
5707
+ "grad_norm": 0.4189973771572113,
5708
+ "learning_rate": 3.257402200743821e-05,
5709
+ "loss": 2.3329,
5710
+ "step": 811
5711
+ },
5712
+ {
5713
+ "epoch": 0.7391057003072021,
5714
+ "grad_norm": 0.44023388624191284,
5715
+ "learning_rate": 3.2361258970709397e-05,
5716
+ "loss": 2.3947,
5717
+ "step": 812
5718
+ },
5719
+ {
5720
+ "epoch": 0.7400159290021618,
5721
+ "grad_norm": 0.41705650091171265,
5722
+ "learning_rate": 3.21490588442868e-05,
5723
+ "loss": 2.261,
5724
+ "step": 813
5725
+ },
5726
+ {
5727
+ "epoch": 0.7409261576971214,
5728
+ "grad_norm": 0.431820273399353,
5729
+ "learning_rate": 3.19374233941647e-05,
5730
+ "loss": 2.4248,
5731
+ "step": 814
5732
+ },
5733
+ {
5734
+ "epoch": 0.741836386392081,
5735
+ "grad_norm": 0.4188806414604187,
5736
+ "learning_rate": 3.172635438163816e-05,
5737
+ "loss": 2.2794,
5738
+ "step": 815
5739
+ },
5740
+ {
5741
+ "epoch": 0.7427466150870407,
5742
+ "grad_norm": 0.41158682107925415,
5743
+ "learning_rate": 3.1515853563288076e-05,
5744
+ "loss": 2.1274,
5745
+ "step": 816
5746
+ },
5747
+ {
5748
+ "epoch": 0.7436568437820003,
5749
+ "grad_norm": 0.4038848876953125,
5750
+ "learning_rate": 3.130592269096671e-05,
5751
+ "loss": 2.0359,
5752
+ "step": 817
5753
+ },
5754
+ {
5755
+ "epoch": 0.7445670724769599,
5756
+ "grad_norm": 0.4189314842224121,
5757
+ "learning_rate": 3.1096563511783014e-05,
5758
+ "loss": 2.1405,
5759
+ "step": 818
5760
+ },
5761
+ {
5762
+ "epoch": 0.7454773011719195,
5763
+ "grad_norm": 0.39889493584632874,
5764
+ "learning_rate": 3.08877777680882e-05,
5765
+ "loss": 2.0187,
5766
+ "step": 819
5767
+ },
5768
+ {
5769
+ "epoch": 0.7463875298668791,
5770
+ "grad_norm": 0.38899531960487366,
5771
+ "learning_rate": 3.0679567197461134e-05,
5772
+ "loss": 2.0957,
5773
+ "step": 820
5774
+ },
5775
+ {
5776
+ "epoch": 0.7472977585618387,
5777
+ "grad_norm": 0.41540107131004333,
5778
+ "learning_rate": 3.047193353269382e-05,
5779
+ "loss": 2.1757,
5780
+ "step": 821
5781
+ },
5782
+ {
5783
+ "epoch": 0.7482079872567983,
5784
+ "grad_norm": 0.41734689474105835,
5785
+ "learning_rate": 3.0264878501777306e-05,
5786
+ "loss": 2.0902,
5787
+ "step": 822
5788
+ },
5789
+ {
5790
+ "epoch": 0.7491182159517579,
5791
+ "grad_norm": 0.44342851638793945,
5792
+ "learning_rate": 3.005840382788685e-05,
5793
+ "loss": 2.087,
5794
+ "step": 823
5795
+ },
5796
+ {
5797
+ "epoch": 0.7500284446467175,
5798
+ "grad_norm": 0.430385559797287,
5799
+ "learning_rate": 2.9852511229367865e-05,
5800
+ "loss": 2.1539,
5801
+ "step": 824
5802
+ },
5803
+ {
5804
+ "epoch": 0.7509386733416771,
5805
+ "grad_norm": 0.4446250796318054,
5806
+ "learning_rate": 2.9647202419721687e-05,
5807
+ "loss": 2.2294,
5808
+ "step": 825
5809
+ },
5810
+ {
5811
+ "epoch": 0.7509386733416771,
5812
+ "eval_loss": 2.2184133529663086,
5813
+ "eval_runtime": 204.0107,
5814
+ "eval_samples_per_second": 9.073,
5815
+ "eval_steps_per_second": 4.539,
5816
+ "step": 825
5817
  }
5818
  ],
5819
  "logging_steps": 1,
 
5833
  "attributes": {}
5834
  }
5835
  },
5836
+ "total_flos": 1.5204499502137344e+18,
5837
  "train_batch_size": 2,
5838
  "trial_name": null,
5839
  "trial_params": null