dabrown commited on
Commit
7efac53
·
verified ·
1 Parent(s): 0fa293f

Training in progress, step 825, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f2bba67c1c1484d2bc04c25f371989dddfa7218d9db23366bcfd17cb36894c8
3
  size 80792096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2788e6dfb6435e62bc41b789bd050e58a00b8e97462570125c1f3a3b3a5752c3
3
  size 80792096
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:507ef66950bd52137d14c378fc2f78b5ad9af0c9387506f9ca6699bcba5321d8
3
  size 41460084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51f9f9871646780dbef98536e3ba6799fe3cd11660641ef8398a22395fd35d67
3
  size 41460084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5d154d045ee189af4c648f80535098cfde6139351de9c4d32c890f904602cee
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ed957e35db52d753e90b2e89a572c6b011e6c971890858f14f12a4305efd1f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:789cac547d76905ddc88036180d9f246f307a104c94da93e131a174052f790e8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:797085a729e10588f0af8dfcea7980f4fc8438c6de826417968959a62c5bdc9a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5006257822277848,
5
  "eval_steps": 275,
6
- "global_step": 550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3881,6 +3881,1939 @@
3881
  "eval_samples_per_second": 9.013,
3882
  "eval_steps_per_second": 4.509,
3883
  "step": 550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3884
  }
3885
  ],
3886
  "logging_steps": 1,
@@ -3900,7 +5833,7 @@
3900
  "attributes": {}
3901
  }
3902
  },
3903
- "total_flos": 1.0074178982447677e+18,
3904
  "train_batch_size": 2,
3905
  "trial_name": null,
3906
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7509386733416771,
5
  "eval_steps": 275,
6
+ "global_step": 825,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3881
  "eval_samples_per_second": 9.013,
3882
  "eval_steps_per_second": 4.509,
3883
  "step": 550
3884
+ },
3885
+ {
3886
+ "epoch": 0.5015360109227444,
3887
+ "grad_norm": 0.4380887746810913,
3888
+ "learning_rate": 0.00010100967745256766,
3889
+ "loss": 2.4596,
3890
+ "step": 551
3891
+ },
3892
+ {
3893
+ "epoch": 0.502446239617704,
3894
+ "grad_norm": 0.4522015154361725,
3895
+ "learning_rate": 0.00010072120418249745,
3896
+ "loss": 2.3217,
3897
+ "step": 552
3898
+ },
3899
+ {
3900
+ "epoch": 0.5033564683126636,
3901
+ "grad_norm": 0.4581010043621063,
3902
+ "learning_rate": 0.00010043272491034523,
3903
+ "loss": 2.4948,
3904
+ "step": 553
3905
+ },
3906
+ {
3907
+ "epoch": 0.5042666970076232,
3908
+ "grad_norm": 0.4353744685649872,
3909
+ "learning_rate": 0.00010014424203692388,
3910
+ "loss": 2.3769,
3911
+ "step": 554
3912
+ },
3913
+ {
3914
+ "epoch": 0.5051769257025828,
3915
+ "grad_norm": 0.4150161147117615,
3916
+ "learning_rate": 9.985575796307615e-05,
3917
+ "loss": 2.3557,
3918
+ "step": 555
3919
+ },
3920
+ {
3921
+ "epoch": 0.5060871543975424,
3922
+ "grad_norm": 0.4532707631587982,
3923
+ "learning_rate": 9.956727508965481e-05,
3924
+ "loss": 2.3114,
3925
+ "step": 556
3926
+ },
3927
+ {
3928
+ "epoch": 0.506997383092502,
3929
+ "grad_norm": 0.42450255155563354,
3930
+ "learning_rate": 9.927879581750259e-05,
3931
+ "loss": 2.2911,
3932
+ "step": 557
3933
+ },
3934
+ {
3935
+ "epoch": 0.5079076117874616,
3936
+ "grad_norm": 0.42910221219062805,
3937
+ "learning_rate": 9.899032254743235e-05,
3938
+ "loss": 2.3062,
3939
+ "step": 558
3940
+ },
3941
+ {
3942
+ "epoch": 0.5088178404824212,
3943
+ "grad_norm": 0.42122408747673035,
3944
+ "learning_rate": 9.870185768020693e-05,
3945
+ "loss": 2.3294,
3946
+ "step": 559
3947
+ },
3948
+ {
3949
+ "epoch": 0.5097280691773808,
3950
+ "grad_norm": 0.4562203884124756,
3951
+ "learning_rate": 9.84134036165192e-05,
3952
+ "loss": 2.2674,
3953
+ "step": 560
3954
+ },
3955
+ {
3956
+ "epoch": 0.5106382978723404,
3957
+ "grad_norm": 0.3905525207519531,
3958
+ "learning_rate": 9.812496275697226e-05,
3959
+ "loss": 2.1259,
3960
+ "step": 561
3961
+ },
3962
+ {
3963
+ "epoch": 0.5115485265673,
3964
+ "grad_norm": 0.41641199588775635,
3965
+ "learning_rate": 9.783653750205915e-05,
3966
+ "loss": 2.2191,
3967
+ "step": 562
3968
+ },
3969
+ {
3970
+ "epoch": 0.5124587552622596,
3971
+ "grad_norm": 0.4090450704097748,
3972
+ "learning_rate": 9.754813025214317e-05,
3973
+ "loss": 2.2478,
3974
+ "step": 563
3975
+ },
3976
+ {
3977
+ "epoch": 0.5133689839572193,
3978
+ "grad_norm": 0.4293293356895447,
3979
+ "learning_rate": 9.725974340743769e-05,
3980
+ "loss": 2.3854,
3981
+ "step": 564
3982
+ },
3983
+ {
3984
+ "epoch": 0.5142792126521789,
3985
+ "grad_norm": 0.4138126075267792,
3986
+ "learning_rate": 9.697137936798634e-05,
3987
+ "loss": 2.2903,
3988
+ "step": 565
3989
+ },
3990
+ {
3991
+ "epoch": 0.5151894413471385,
3992
+ "grad_norm": 0.3979492783546448,
3993
+ "learning_rate": 9.668304053364294e-05,
3994
+ "loss": 2.1878,
3995
+ "step": 566
3996
+ },
3997
+ {
3998
+ "epoch": 0.5160996700420981,
3999
+ "grad_norm": 0.38530561327934265,
4000
+ "learning_rate": 9.639472930405143e-05,
4001
+ "loss": 2.1464,
4002
+ "step": 567
4003
+ },
4004
+ {
4005
+ "epoch": 0.5170098987370577,
4006
+ "grad_norm": 0.4263143837451935,
4007
+ "learning_rate": 9.610644807862625e-05,
4008
+ "loss": 2.1856,
4009
+ "step": 568
4010
+ },
4011
+ {
4012
+ "epoch": 0.5179201274320173,
4013
+ "grad_norm": 0.41127926111221313,
4014
+ "learning_rate": 9.581819925653188e-05,
4015
+ "loss": 2.198,
4016
+ "step": 569
4017
+ },
4018
+ {
4019
+ "epoch": 0.5188303561269769,
4020
+ "grad_norm": 0.3917299509048462,
4021
+ "learning_rate": 9.552998523666326e-05,
4022
+ "loss": 2.1325,
4023
+ "step": 570
4024
+ },
4025
+ {
4026
+ "epoch": 0.5197405848219365,
4027
+ "grad_norm": 0.39180508255958557,
4028
+ "learning_rate": 9.524180841762577e-05,
4029
+ "loss": 2.0702,
4030
+ "step": 571
4031
+ },
4032
+ {
4033
+ "epoch": 0.5206508135168961,
4034
+ "grad_norm": 0.3829837739467621,
4035
+ "learning_rate": 9.495367119771503e-05,
4036
+ "loss": 1.8913,
4037
+ "step": 572
4038
+ },
4039
+ {
4040
+ "epoch": 0.5215610422118557,
4041
+ "grad_norm": 0.412706196308136,
4042
+ "learning_rate": 9.46655759748972e-05,
4043
+ "loss": 2.1221,
4044
+ "step": 573
4045
+ },
4046
+ {
4047
+ "epoch": 0.5224712709068153,
4048
+ "grad_norm": 0.39748141169548035,
4049
+ "learning_rate": 9.437752514678887e-05,
4050
+ "loss": 2.0427,
4051
+ "step": 574
4052
+ },
4053
+ {
4054
+ "epoch": 0.5233814996017749,
4055
+ "grad_norm": 0.42854538559913635,
4056
+ "learning_rate": 9.408952111063727e-05,
4057
+ "loss": 2.121,
4058
+ "step": 575
4059
+ },
4060
+ {
4061
+ "epoch": 0.5242917282967345,
4062
+ "grad_norm": 0.414654016494751,
4063
+ "learning_rate": 9.380156626330009e-05,
4064
+ "loss": 2.038,
4065
+ "step": 576
4066
+ },
4067
+ {
4068
+ "epoch": 0.5252019569916941,
4069
+ "grad_norm": 0.4241427183151245,
4070
+ "learning_rate": 9.35136630012257e-05,
4071
+ "loss": 2.1312,
4072
+ "step": 577
4073
+ },
4074
+ {
4075
+ "epoch": 0.5261121856866537,
4076
+ "grad_norm": 0.42928779125213623,
4077
+ "learning_rate": 9.322581372043321e-05,
4078
+ "loss": 2.1875,
4079
+ "step": 578
4080
+ },
4081
+ {
4082
+ "epoch": 0.5270224143816133,
4083
+ "grad_norm": 0.4133308231830597,
4084
+ "learning_rate": 9.293802081649243e-05,
4085
+ "loss": 2.0477,
4086
+ "step": 579
4087
+ },
4088
+ {
4089
+ "epoch": 0.527932643076573,
4090
+ "grad_norm": 0.427898645401001,
4091
+ "learning_rate": 9.265028668450402e-05,
4092
+ "loss": 2.0833,
4093
+ "step": 580
4094
+ },
4095
+ {
4096
+ "epoch": 0.5288428717715326,
4097
+ "grad_norm": 0.4321751892566681,
4098
+ "learning_rate": 9.23626137190794e-05,
4099
+ "loss": 2.1698,
4100
+ "step": 581
4101
+ },
4102
+ {
4103
+ "epoch": 0.5297531004664922,
4104
+ "grad_norm": 0.4715782105922699,
4105
+ "learning_rate": 9.207500431432115e-05,
4106
+ "loss": 2.1347,
4107
+ "step": 582
4108
+ },
4109
+ {
4110
+ "epoch": 0.5306633291614519,
4111
+ "grad_norm": 0.45599547028541565,
4112
+ "learning_rate": 9.178746086380275e-05,
4113
+ "loss": 2.1469,
4114
+ "step": 583
4115
+ },
4116
+ {
4117
+ "epoch": 0.5315735578564115,
4118
+ "grad_norm": 0.45286545157432556,
4119
+ "learning_rate": 9.149998576054874e-05,
4120
+ "loss": 2.2013,
4121
+ "step": 584
4122
+ },
4123
+ {
4124
+ "epoch": 0.5324837865513711,
4125
+ "grad_norm": 0.47089457511901855,
4126
+ "learning_rate": 9.121258139701502e-05,
4127
+ "loss": 2.2125,
4128
+ "step": 585
4129
+ },
4130
+ {
4131
+ "epoch": 0.5333940152463307,
4132
+ "grad_norm": 0.46750229597091675,
4133
+ "learning_rate": 9.092525016506858e-05,
4134
+ "loss": 2.1186,
4135
+ "step": 586
4136
+ },
4137
+ {
4138
+ "epoch": 0.5343042439412903,
4139
+ "grad_norm": 0.4931905269622803,
4140
+ "learning_rate": 9.063799445596795e-05,
4141
+ "loss": 2.2185,
4142
+ "step": 587
4143
+ },
4144
+ {
4145
+ "epoch": 0.5352144726362499,
4146
+ "grad_norm": 0.48538026213645935,
4147
+ "learning_rate": 9.035081666034304e-05,
4148
+ "loss": 2.2369,
4149
+ "step": 588
4150
+ },
4151
+ {
4152
+ "epoch": 0.5361247013312095,
4153
+ "grad_norm": 0.4944066107273102,
4154
+ "learning_rate": 9.006371916817534e-05,
4155
+ "loss": 2.2771,
4156
+ "step": 589
4157
+ },
4158
+ {
4159
+ "epoch": 0.5370349300261691,
4160
+ "grad_norm": 0.4564894139766693,
4161
+ "learning_rate": 8.977670436877811e-05,
4162
+ "loss": 2.0879,
4163
+ "step": 590
4164
+ },
4165
+ {
4166
+ "epoch": 0.5379451587211287,
4167
+ "grad_norm": 0.5046347379684448,
4168
+ "learning_rate": 8.948977465077632e-05,
4169
+ "loss": 2.2197,
4170
+ "step": 591
4171
+ },
4172
+ {
4173
+ "epoch": 0.5388553874160883,
4174
+ "grad_norm": 0.49683472514152527,
4175
+ "learning_rate": 8.920293240208694e-05,
4176
+ "loss": 2.2152,
4177
+ "step": 592
4178
+ },
4179
+ {
4180
+ "epoch": 0.5397656161110479,
4181
+ "grad_norm": 0.5223331451416016,
4182
+ "learning_rate": 8.891618000989891e-05,
4183
+ "loss": 2.3358,
4184
+ "step": 593
4185
+ },
4186
+ {
4187
+ "epoch": 0.5406758448060075,
4188
+ "grad_norm": 0.5552563667297363,
4189
+ "learning_rate": 8.862951986065345e-05,
4190
+ "loss": 2.1608,
4191
+ "step": 594
4192
+ },
4193
+ {
4194
+ "epoch": 0.5415860735009671,
4195
+ "grad_norm": 0.5853347778320312,
4196
+ "learning_rate": 8.83429543400241e-05,
4197
+ "loss": 2.3679,
4198
+ "step": 595
4199
+ },
4200
+ {
4201
+ "epoch": 0.5424963021959267,
4202
+ "grad_norm": 0.5858141183853149,
4203
+ "learning_rate": 8.805648583289674e-05,
4204
+ "loss": 2.341,
4205
+ "step": 596
4206
+ },
4207
+ {
4208
+ "epoch": 0.5434065308908863,
4209
+ "grad_norm": 0.6405509114265442,
4210
+ "learning_rate": 8.777011672335008e-05,
4211
+ "loss": 2.4773,
4212
+ "step": 597
4213
+ },
4214
+ {
4215
+ "epoch": 0.5443167595858459,
4216
+ "grad_norm": 0.7342801094055176,
4217
+ "learning_rate": 8.748384939463543e-05,
4218
+ "loss": 2.557,
4219
+ "step": 598
4220
+ },
4221
+ {
4222
+ "epoch": 0.5452269882808055,
4223
+ "grad_norm": 0.8813995122909546,
4224
+ "learning_rate": 8.719768622915714e-05,
4225
+ "loss": 2.5595,
4226
+ "step": 599
4227
+ },
4228
+ {
4229
+ "epoch": 0.5461372169757651,
4230
+ "grad_norm": 1.722114086151123,
4231
+ "learning_rate": 8.691162960845264e-05,
4232
+ "loss": 2.7211,
4233
+ "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.5470474456707247,
4237
+ "grad_norm": 0.424265056848526,
4238
+ "learning_rate": 8.662568191317273e-05,
4239
+ "loss": 2.3728,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.5479576743656844,
4244
+ "grad_norm": 0.45933809876441956,
4245
+ "learning_rate": 8.633984552306164e-05,
4246
+ "loss": 2.4234,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.548867903060644,
4251
+ "grad_norm": 0.45455530285835266,
4252
+ "learning_rate": 8.605412281693727e-05,
4253
+ "loss": 2.5062,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.5497781317556036,
4258
+ "grad_norm": 0.4334143400192261,
4259
+ "learning_rate": 8.57685161726715e-05,
4260
+ "loss": 2.3087,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.5506883604505632,
4265
+ "grad_norm": 0.4537433385848999,
4266
+ "learning_rate": 8.548302796717019e-05,
4267
+ "loss": 2.395,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.5515985891455228,
4272
+ "grad_norm": 0.43673837184906006,
4273
+ "learning_rate": 8.519766057635355e-05,
4274
+ "loss": 2.3855,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.5525088178404824,
4279
+ "grad_norm": 0.43078145384788513,
4280
+ "learning_rate": 8.491241637513644e-05,
4281
+ "loss": 2.2559,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.553419046535442,
4286
+ "grad_norm": 0.4094640612602234,
4287
+ "learning_rate": 8.462729773740832e-05,
4288
+ "loss": 2.295,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.5543292752304017,
4293
+ "grad_norm": 0.4126126170158386,
4294
+ "learning_rate": 8.434230703601384e-05,
4295
+ "loss": 2.2019,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.5552395039253613,
4300
+ "grad_norm": 0.4372231066226959,
4301
+ "learning_rate": 8.405744664273278e-05,
4302
+ "loss": 2.4243,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.5561497326203209,
4307
+ "grad_norm": 0.42160138487815857,
4308
+ "learning_rate": 8.37727189282606e-05,
4309
+ "loss": 2.2805,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.5570599613152805,
4314
+ "grad_norm": 0.4336857795715332,
4315
+ "learning_rate": 8.34881262621884e-05,
4316
+ "loss": 2.4811,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.5579701900102401,
4321
+ "grad_norm": 0.40520837903022766,
4322
+ "learning_rate": 8.320367101298351e-05,
4323
+ "loss": 2.1731,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.5588804187051997,
4328
+ "grad_norm": 0.42664197087287903,
4329
+ "learning_rate": 8.291935554796962e-05,
4330
+ "loss": 2.3403,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.5597906474001593,
4335
+ "grad_norm": 0.4109039902687073,
4336
+ "learning_rate": 8.263518223330697e-05,
4337
+ "loss": 2.2425,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.5607008760951189,
4342
+ "grad_norm": 0.4032575786113739,
4343
+ "learning_rate": 8.235115343397295e-05,
4344
+ "loss": 2.2593,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.5616111047900785,
4349
+ "grad_norm": 0.3929396867752075,
4350
+ "learning_rate": 8.206727151374207e-05,
4351
+ "loss": 2.0896,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.5625213334850381,
4356
+ "grad_norm": 0.38567835092544556,
4357
+ "learning_rate": 8.178353883516664e-05,
4358
+ "loss": 2.0715,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.5634315621799977,
4363
+ "grad_norm": 0.405369371175766,
4364
+ "learning_rate": 8.149995775955686e-05,
4365
+ "loss": 2.2249,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.5643417908749573,
4370
+ "grad_norm": 0.3889697790145874,
4371
+ "learning_rate": 8.121653064696118e-05,
4372
+ "loss": 2.0797,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.565252019569917,
4377
+ "grad_norm": 0.4065384864807129,
4378
+ "learning_rate": 8.093325985614685e-05,
4379
+ "loss": 2.2012,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.5661622482648766,
4384
+ "grad_norm": 0.4066416323184967,
4385
+ "learning_rate": 8.065014774458003e-05,
4386
+ "loss": 2.1183,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.5670724769598362,
4391
+ "grad_norm": 0.40575870871543884,
4392
+ "learning_rate": 8.036719666840647e-05,
4393
+ "loss": 2.0258,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.5679827056547958,
4398
+ "grad_norm": 0.42911243438720703,
4399
+ "learning_rate": 8.008440898243149e-05,
4400
+ "loss": 2.1186,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.5688929343497554,
4405
+ "grad_norm": 0.4009549021720886,
4406
+ "learning_rate": 7.980178704010089e-05,
4407
+ "loss": 2.0049,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.569803163044715,
4412
+ "grad_norm": 0.41156989336013794,
4413
+ "learning_rate": 7.951933319348095e-05,
4414
+ "loss": 2.0272,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.5707133917396746,
4419
+ "grad_norm": 0.4248954653739929,
4420
+ "learning_rate": 7.923704979323899e-05,
4421
+ "loss": 2.077,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.5716236204346342,
4426
+ "grad_norm": 0.45484524965286255,
4427
+ "learning_rate": 7.895493918862396e-05,
4428
+ "loss": 2.2255,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.5725338491295938,
4433
+ "grad_norm": 0.4571921229362488,
4434
+ "learning_rate": 7.867300372744657e-05,
4435
+ "loss": 2.1373,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.5734440778245534,
4440
+ "grad_norm": 0.44238901138305664,
4441
+ "learning_rate": 7.839124575606004e-05,
4442
+ "loss": 2.1147,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.574354306519513,
4447
+ "grad_norm": 0.4206310510635376,
4448
+ "learning_rate": 7.810966761934053e-05,
4449
+ "loss": 2.0508,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.5752645352144726,
4454
+ "grad_norm": 0.43381330370903015,
4455
+ "learning_rate": 7.782827166066739e-05,
4456
+ "loss": 2.0847,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.5761747639094322,
4461
+ "grad_norm": 0.4460139572620392,
4462
+ "learning_rate": 7.754706022190398e-05,
4463
+ "loss": 2.1288,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.5770849926043918,
4468
+ "grad_norm": 0.4371720850467682,
4469
+ "learning_rate": 7.726603564337791e-05,
4470
+ "loss": 2.0476,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.5779952212993514,
4475
+ "grad_norm": 0.4623599052429199,
4476
+ "learning_rate": 7.69852002638618e-05,
4477
+ "loss": 2.2858,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.578905449994311,
4482
+ "grad_norm": 0.4422992765903473,
4483
+ "learning_rate": 7.670455642055361e-05,
4484
+ "loss": 2.1072,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.5798156786892706,
4489
+ "grad_norm": 0.4804936647415161,
4490
+ "learning_rate": 7.642410644905726e-05,
4491
+ "loss": 2.2218,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.5807259073842302,
4496
+ "grad_norm": 0.48249900341033936,
4497
+ "learning_rate": 7.614385268336336e-05,
4498
+ "loss": 2.2916,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.5816361360791898,
4503
+ "grad_norm": 0.46635982394218445,
4504
+ "learning_rate": 7.586379745582944e-05,
4505
+ "loss": 2.1636,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.5825463647741494,
4510
+ "grad_norm": 0.4670505225658417,
4511
+ "learning_rate": 7.558394309716088e-05,
4512
+ "loss": 2.2052,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.5834565934691092,
4517
+ "grad_norm": 0.49475541710853577,
4518
+ "learning_rate": 7.530429193639128e-05,
4519
+ "loss": 2.18,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.5843668221640688,
4524
+ "grad_norm": 0.5231596231460571,
4525
+ "learning_rate": 7.502484630086318e-05,
4526
+ "loss": 2.2095,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.5852770508590284,
4531
+ "grad_norm": 0.5045900344848633,
4532
+ "learning_rate": 7.474560851620873e-05,
4533
+ "loss": 2.053,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.586187279553988,
4538
+ "grad_norm": 0.5511046051979065,
4539
+ "learning_rate": 7.446658090633026e-05,
4540
+ "loss": 2.2706,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.5870975082489476,
4545
+ "grad_norm": 0.5700446963310242,
4546
+ "learning_rate": 7.41877657933809e-05,
4547
+ "loss": 2.3999,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.5880077369439072,
4552
+ "grad_norm": 0.5792605876922607,
4553
+ "learning_rate": 7.390916549774536e-05,
4554
+ "loss": 2.2391,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.5889179656388668,
4559
+ "grad_norm": 0.6770455241203308,
4560
+ "learning_rate": 7.363078233802063e-05,
4561
+ "loss": 2.6564,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.5898281943338264,
4566
+ "grad_norm": 0.7092955708503723,
4567
+ "learning_rate": 7.335261863099651e-05,
4568
+ "loss": 2.4722,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.590738423028786,
4573
+ "grad_norm": 0.8125056028366089,
4574
+ "learning_rate": 7.307467669163655e-05,
4575
+ "loss": 2.3581,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.5916486517237456,
4580
+ "grad_norm": 1.5941253900527954,
4581
+ "learning_rate": 7.279695883305866e-05,
4582
+ "loss": 2.16,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.5925588804187052,
4587
+ "grad_norm": 0.44398507475852966,
4588
+ "learning_rate": 7.251946736651582e-05,
4589
+ "loss": 2.4689,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.5934691091136648,
4594
+ "grad_norm": 0.4264509975910187,
4595
+ "learning_rate": 7.224220460137701e-05,
4596
+ "loss": 2.4081,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.5943793378086244,
4601
+ "grad_norm": 0.4222484529018402,
4602
+ "learning_rate": 7.196517284510773e-05,
4603
+ "loss": 2.3827,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.595289566503584,
4608
+ "grad_norm": 0.4516051411628723,
4609
+ "learning_rate": 7.168837440325114e-05,
4610
+ "loss": 2.399,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.5961997951985436,
4615
+ "grad_norm": 0.4370306730270386,
4616
+ "learning_rate": 7.141181157940859e-05,
4617
+ "loss": 2.3837,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.5971100238935032,
4622
+ "grad_norm": 0.4236253798007965,
4623
+ "learning_rate": 7.11354866752205e-05,
4624
+ "loss": 2.3066,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.5980202525884628,
4629
+ "grad_norm": 0.41718846559524536,
4630
+ "learning_rate": 7.085940199034735e-05,
4631
+ "loss": 2.3841,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.5989304812834224,
4636
+ "grad_norm": 0.4299750030040741,
4637
+ "learning_rate": 7.058355982245037e-05,
4638
+ "loss": 2.3842,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.599840709978382,
4643
+ "grad_norm": 0.4180915057659149,
4644
+ "learning_rate": 7.030796246717255e-05,
4645
+ "loss": 2.0758,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.6007509386733417,
4650
+ "grad_norm": 0.45195114612579346,
4651
+ "learning_rate": 7.003261221811934e-05,
4652
+ "loss": 2.4826,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.6016611673683013,
4657
+ "grad_norm": 0.4253404140472412,
4658
+ "learning_rate": 6.97575113668399e-05,
4659
+ "loss": 2.3705,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.6025713960632609,
4664
+ "grad_norm": 0.4198931157588959,
4665
+ "learning_rate": 6.948266220280771e-05,
4666
+ "loss": 2.3396,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.6034816247582205,
4671
+ "grad_norm": 0.43457460403442383,
4672
+ "learning_rate": 6.920806701340155e-05,
4673
+ "loss": 2.1447,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.6043918534531801,
4678
+ "grad_norm": 0.40161874890327454,
4679
+ "learning_rate": 6.893372808388675e-05,
4680
+ "loss": 2.2443,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.6053020821481397,
4685
+ "grad_norm": 0.4039609432220459,
4686
+ "learning_rate": 6.865964769739575e-05,
4687
+ "loss": 2.1815,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.6062123108430993,
4692
+ "grad_norm": 0.4061351716518402,
4693
+ "learning_rate": 6.838582813490947e-05,
4694
+ "loss": 2.1073,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.607122539538059,
4699
+ "grad_norm": 0.4206211268901825,
4700
+ "learning_rate": 6.811227167523815e-05,
4701
+ "loss": 2.2549,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.6080327682330186,
4706
+ "grad_norm": 0.3936857283115387,
4707
+ "learning_rate": 6.783898059500233e-05,
4708
+ "loss": 2.1373,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.6089429969279782,
4713
+ "grad_norm": 0.3954029083251953,
4714
+ "learning_rate": 6.756595716861407e-05,
4715
+ "loss": 2.1001,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.6098532256229378,
4720
+ "grad_norm": 0.407713919878006,
4721
+ "learning_rate": 6.729320366825784e-05,
4722
+ "loss": 2.0967,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.6107634543178974,
4727
+ "grad_norm": 0.41096213459968567,
4728
+ "learning_rate": 6.702072236387182e-05,
4729
+ "loss": 2.0899,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.611673683012857,
4734
+ "grad_norm": 0.40465790033340454,
4735
+ "learning_rate": 6.674851552312878e-05,
4736
+ "loss": 2.089,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.6125839117078166,
4741
+ "grad_norm": 0.3991434574127197,
4742
+ "learning_rate": 6.647658541141735e-05,
4743
+ "loss": 1.9788,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.6134941404027762,
4748
+ "grad_norm": 0.42327383160591125,
4749
+ "learning_rate": 6.620493429182323e-05,
4750
+ "loss": 2.1672,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.6144043690977358,
4755
+ "grad_norm": 0.4061299264431,
4756
+ "learning_rate": 6.593356442511015e-05,
4757
+ "loss": 2.1617,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.6153145977926954,
4762
+ "grad_norm": 0.4082658588886261,
4763
+ "learning_rate": 6.566247806970119e-05,
4764
+ "loss": 2.0112,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.616224826487655,
4769
+ "grad_norm": 0.43016669154167175,
4770
+ "learning_rate": 6.539167748165994e-05,
4771
+ "loss": 2.0,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.6171350551826146,
4776
+ "grad_norm": 0.43142226338386536,
4777
+ "learning_rate": 6.512116491467185e-05,
4778
+ "loss": 2.1585,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.6180452838775743,
4783
+ "grad_norm": 0.4271491467952728,
4784
+ "learning_rate": 6.485094262002529e-05,
4785
+ "loss": 1.9628,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.6189555125725339,
4790
+ "grad_norm": 0.44002971053123474,
4791
+ "learning_rate": 6.458101284659286e-05,
4792
+ "loss": 2.2214,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.6198657412674935,
4797
+ "grad_norm": 0.4215126931667328,
4798
+ "learning_rate": 6.431137784081282e-05,
4799
+ "loss": 2.0377,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.6207759699624531,
4804
+ "grad_norm": 0.46792343258857727,
4805
+ "learning_rate": 6.404203984667019e-05,
4806
+ "loss": 2.029,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.6216861986574127,
4811
+ "grad_norm": 0.45737308263778687,
4812
+ "learning_rate": 6.377300110567821e-05,
4813
+ "loss": 2.2375,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.6225964273523723,
4818
+ "grad_norm": 0.4526033401489258,
4819
+ "learning_rate": 6.350426385685957e-05,
4820
+ "loss": 2.2562,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.6235066560473319,
4825
+ "grad_norm": 0.45917776226997375,
4826
+ "learning_rate": 6.323583033672799e-05,
4827
+ "loss": 2.1321,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.6244168847422915,
4832
+ "grad_norm": 0.4713301658630371,
4833
+ "learning_rate": 6.296770277926937e-05,
4834
+ "loss": 2.07,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.6253271134372511,
4839
+ "grad_norm": 0.5036799907684326,
4840
+ "learning_rate": 6.269988341592328e-05,
4841
+ "loss": 2.1103,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.6262373421322107,
4846
+ "grad_norm": 0.4843004643917084,
4847
+ "learning_rate": 6.243237447556449e-05,
4848
+ "loss": 2.0936,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.6271475708271703,
4853
+ "grad_norm": 0.4738497734069824,
4854
+ "learning_rate": 6.216517818448423e-05,
4855
+ "loss": 2.1004,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.6280577995221299,
4860
+ "grad_norm": 0.5081862211227417,
4861
+ "learning_rate": 6.189829676637182e-05,
4862
+ "loss": 2.2177,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.6289680282170895,
4867
+ "grad_norm": 0.5060831904411316,
4868
+ "learning_rate": 6.163173244229619e-05,
4869
+ "loss": 2.1342,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.6298782569120491,
4874
+ "grad_norm": 0.5047332644462585,
4875
+ "learning_rate": 6.136548743068713e-05,
4876
+ "loss": 2.0727,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.6307884856070087,
4881
+ "grad_norm": 0.5174648761749268,
4882
+ "learning_rate": 6.109956394731722e-05,
4883
+ "loss": 2.0623,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.6316987143019683,
4888
+ "grad_norm": 0.5705139636993408,
4889
+ "learning_rate": 6.083396420528298e-05,
4890
+ "loss": 2.4454,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.6326089429969279,
4895
+ "grad_norm": 0.5653088092803955,
4896
+ "learning_rate": 6.056869041498687e-05,
4897
+ "loss": 2.2071,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.6335191716918875,
4902
+ "grad_norm": 0.6105946898460388,
4903
+ "learning_rate": 6.030374478411847e-05,
4904
+ "loss": 2.3081,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.6344294003868471,
4909
+ "grad_norm": 0.6362658143043518,
4910
+ "learning_rate": 6.0039129517636435e-05,
4911
+ "loss": 2.3426,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.6353396290818069,
4916
+ "grad_norm": 0.7242766618728638,
4917
+ "learning_rate": 5.9774846817750105e-05,
4918
+ "loss": 2.4877,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.6362498577767665,
4923
+ "grad_norm": 0.9446219205856323,
4924
+ "learning_rate": 5.951089888390087e-05,
4925
+ "loss": 2.7741,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.6371600864717261,
4930
+ "grad_norm": 1.5826219320297241,
4931
+ "learning_rate": 5.924728791274432e-05,
4932
+ "loss": 2.533,
4933
+ "step": 700
4934
+ },
4935
+ {
4936
+ "epoch": 0.6380703151666857,
4937
+ "grad_norm": 0.4994019567966461,
4938
+ "learning_rate": 5.89840160981316e-05,
4939
+ "loss": 2.4388,
4940
+ "step": 701
4941
+ },
4942
+ {
4943
+ "epoch": 0.6389805438616453,
4944
+ "grad_norm": 0.45213326811790466,
4945
+ "learning_rate": 5.872108563109131e-05,
4946
+ "loss": 2.3644,
4947
+ "step": 702
4948
+ },
4949
+ {
4950
+ "epoch": 0.6398907725566049,
4951
+ "grad_norm": 0.45655617117881775,
4952
+ "learning_rate": 5.845849869981137e-05,
4953
+ "loss": 2.5202,
4954
+ "step": 703
4955
+ },
4956
+ {
4957
+ "epoch": 0.6408010012515645,
4958
+ "grad_norm": 0.41640329360961914,
4959
+ "learning_rate": 5.819625748962049e-05,
4960
+ "loss": 2.3097,
4961
+ "step": 704
4962
+ },
4963
+ {
4964
+ "epoch": 0.6417112299465241,
4965
+ "grad_norm": 0.43625307083129883,
4966
+ "learning_rate": 5.79343641829704e-05,
4967
+ "loss": 2.3931,
4968
+ "step": 705
4969
+ },
4970
+ {
4971
+ "epoch": 0.6426214586414837,
4972
+ "grad_norm": 0.44178506731987,
4973
+ "learning_rate": 5.7672820959417254e-05,
4974
+ "loss": 2.3195,
4975
+ "step": 706
4976
+ },
4977
+ {
4978
+ "epoch": 0.6435316873364433,
4979
+ "grad_norm": 0.4416089951992035,
4980
+ "learning_rate": 5.741162999560386e-05,
4981
+ "loss": 2.2446,
4982
+ "step": 707
4983
+ },
4984
+ {
4985
+ "epoch": 0.6444419160314029,
4986
+ "grad_norm": 0.4419204890727997,
4987
+ "learning_rate": 5.7150793465241346e-05,
4988
+ "loss": 2.34,
4989
+ "step": 708
4990
+ },
4991
+ {
4992
+ "epoch": 0.6453521447263625,
4993
+ "grad_norm": 0.45422717928886414,
4994
+ "learning_rate": 5.68903135390912e-05,
4995
+ "loss": 2.1915,
4996
+ "step": 709
4997
+ },
4998
+ {
4999
+ "epoch": 0.6462623734213221,
5000
+ "grad_norm": 0.41635921597480774,
5001
+ "learning_rate": 5.663019238494704e-05,
5002
+ "loss": 2.3147,
5003
+ "step": 710
5004
+ },
5005
+ {
5006
+ "epoch": 0.6471726021162817,
5007
+ "grad_norm": 0.4240402579307556,
5008
+ "learning_rate": 5.637043216761678e-05,
5009
+ "loss": 2.1693,
5010
+ "step": 711
5011
+ },
5012
+ {
5013
+ "epoch": 0.6480828308112413,
5014
+ "grad_norm": 0.41989627480506897,
5015
+ "learning_rate": 5.611103504890444e-05,
5016
+ "loss": 2.2087,
5017
+ "step": 712
5018
+ },
5019
+ {
5020
+ "epoch": 0.6489930595062009,
5021
+ "grad_norm": 0.4187220335006714,
5022
+ "learning_rate": 5.5852003187592226e-05,
5023
+ "loss": 2.3818,
5024
+ "step": 713
5025
+ },
5026
+ {
5027
+ "epoch": 0.6499032882011605,
5028
+ "grad_norm": 0.43209201097488403,
5029
+ "learning_rate": 5.559333873942259e-05,
5030
+ "loss": 2.3176,
5031
+ "step": 714
5032
+ },
5033
+ {
5034
+ "epoch": 0.6508135168961201,
5035
+ "grad_norm": 0.416291207075119,
5036
+ "learning_rate": 5.533504385708024e-05,
5037
+ "loss": 2.2397,
5038
+ "step": 715
5039
+ },
5040
+ {
5041
+ "epoch": 0.6517237455910797,
5042
+ "grad_norm": 0.411857932806015,
5043
+ "learning_rate": 5.5077120690174246e-05,
5044
+ "loss": 2.1142,
5045
+ "step": 716
5046
+ },
5047
+ {
5048
+ "epoch": 0.6526339742860394,
5049
+ "grad_norm": 0.4142412543296814,
5050
+ "learning_rate": 5.481957138522018e-05,
5051
+ "loss": 2.2226,
5052
+ "step": 717
5053
+ },
5054
+ {
5055
+ "epoch": 0.653544202980999,
5056
+ "grad_norm": 0.4322018325328827,
5057
+ "learning_rate": 5.456239808562209e-05,
5058
+ "loss": 2.2078,
5059
+ "step": 718
5060
+ },
5061
+ {
5062
+ "epoch": 0.6544544316759586,
5063
+ "grad_norm": 0.42210081219673157,
5064
+ "learning_rate": 5.4305602931655045e-05,
5065
+ "loss": 2.0579,
5066
+ "step": 719
5067
+ },
5068
+ {
5069
+ "epoch": 0.6553646603709182,
5070
+ "grad_norm": 0.4076862335205078,
5071
+ "learning_rate": 5.404918806044679e-05,
5072
+ "loss": 2.1348,
5073
+ "step": 720
5074
+ },
5075
+ {
5076
+ "epoch": 0.6562748890658778,
5077
+ "grad_norm": 0.43610820174217224,
5078
+ "learning_rate": 5.379315560596038e-05,
5079
+ "loss": 2.2212,
5080
+ "step": 721
5081
+ },
5082
+ {
5083
+ "epoch": 0.6571851177608374,
5084
+ "grad_norm": 0.41793620586395264,
5085
+ "learning_rate": 5.3537507698976365e-05,
5086
+ "loss": 1.9606,
5087
+ "step": 722
5088
+ },
5089
+ {
5090
+ "epoch": 0.658095346455797,
5091
+ "grad_norm": 0.40680915117263794,
5092
+ "learning_rate": 5.328224646707479e-05,
5093
+ "loss": 2.0167,
5094
+ "step": 723
5095
+ },
5096
+ {
5097
+ "epoch": 0.6590055751507566,
5098
+ "grad_norm": 0.424699991941452,
5099
+ "learning_rate": 5.3027374034617785e-05,
5100
+ "loss": 2.1065,
5101
+ "step": 724
5102
+ },
5103
+ {
5104
+ "epoch": 0.6599158038457162,
5105
+ "grad_norm": 0.43258509039878845,
5106
+ "learning_rate": 5.277289252273174e-05,
5107
+ "loss": 2.0974,
5108
+ "step": 725
5109
+ },
5110
+ {
5111
+ "epoch": 0.6608260325406758,
5112
+ "grad_norm": 0.4318636655807495,
5113
+ "learning_rate": 5.251880404928971e-05,
5114
+ "loss": 2.3214,
5115
+ "step": 726
5116
+ },
5117
+ {
5118
+ "epoch": 0.6617362612356354,
5119
+ "grad_norm": 0.4147786498069763,
5120
+ "learning_rate": 5.226511072889371e-05,
5121
+ "loss": 2.1223,
5122
+ "step": 727
5123
+ },
5124
+ {
5125
+ "epoch": 0.662646489930595,
5126
+ "grad_norm": 0.4131387770175934,
5127
+ "learning_rate": 5.201181467285723e-05,
5128
+ "loss": 1.8335,
5129
+ "step": 728
5130
+ },
5131
+ {
5132
+ "epoch": 0.6635567186255547,
5133
+ "grad_norm": 0.456827849149704,
5134
+ "learning_rate": 5.175891798918757e-05,
5135
+ "loss": 2.1428,
5136
+ "step": 729
5137
+ },
5138
+ {
5139
+ "epoch": 0.6644669473205143,
5140
+ "grad_norm": 0.47478604316711426,
5141
+ "learning_rate": 5.1506422782568345e-05,
5142
+ "loss": 2.0526,
5143
+ "step": 730
5144
+ },
5145
+ {
5146
+ "epoch": 0.6653771760154739,
5147
+ "grad_norm": 0.4357281029224396,
5148
+ "learning_rate": 5.125433115434197e-05,
5149
+ "loss": 1.8949,
5150
+ "step": 731
5151
+ },
5152
+ {
5153
+ "epoch": 0.6662874047104335,
5154
+ "grad_norm": 0.45749080181121826,
5155
+ "learning_rate": 5.100264520249205e-05,
5156
+ "loss": 2.1637,
5157
+ "step": 732
5158
+ },
5159
+ {
5160
+ "epoch": 0.6671976334053931,
5161
+ "grad_norm": 0.47157686948776245,
5162
+ "learning_rate": 5.0751367021626215e-05,
5163
+ "loss": 2.1036,
5164
+ "step": 733
5165
+ },
5166
+ {
5167
+ "epoch": 0.6681078621003527,
5168
+ "grad_norm": 0.4490397274494171,
5169
+ "learning_rate": 5.050049870295841e-05,
5170
+ "loss": 1.9553,
5171
+ "step": 734
5172
+ },
5173
+ {
5174
+ "epoch": 0.6690180907953123,
5175
+ "grad_norm": 0.4798765480518341,
5176
+ "learning_rate": 5.025004233429145e-05,
5177
+ "loss": 2.0954,
5178
+ "step": 735
5179
+ },
5180
+ {
5181
+ "epoch": 0.669928319490272,
5182
+ "grad_norm": 0.5034172534942627,
5183
+ "learning_rate": 5.000000000000002e-05,
5184
+ "loss": 2.2027,
5185
+ "step": 736
5186
+ },
5187
+ {
5188
+ "epoch": 0.6708385481852316,
5189
+ "grad_norm": 0.481141060590744,
5190
+ "learning_rate": 4.9750373781012885e-05,
5191
+ "loss": 2.0822,
5192
+ "step": 737
5193
+ },
5194
+ {
5195
+ "epoch": 0.6717487768801912,
5196
+ "grad_norm": 0.49731481075286865,
5197
+ "learning_rate": 4.950116575479586e-05,
5198
+ "loss": 2.0196,
5199
+ "step": 738
5200
+ },
5201
+ {
5202
+ "epoch": 0.6726590055751508,
5203
+ "grad_norm": 0.48321208357810974,
5204
+ "learning_rate": 4.9252377995334444e-05,
5205
+ "loss": 1.9995,
5206
+ "step": 739
5207
+ },
5208
+ {
5209
+ "epoch": 0.6735692342701104,
5210
+ "grad_norm": 0.5173296332359314,
5211
+ "learning_rate": 4.90040125731165e-05,
5212
+ "loss": 2.2078,
5213
+ "step": 740
5214
+ },
5215
+ {
5216
+ "epoch": 0.67447946296507,
5217
+ "grad_norm": 0.5107961297035217,
5218
+ "learning_rate": 4.87560715551151e-05,
5219
+ "loss": 2.0684,
5220
+ "step": 741
5221
+ },
5222
+ {
5223
+ "epoch": 0.6753896916600296,
5224
+ "grad_norm": 0.522492527961731,
5225
+ "learning_rate": 4.85085570047713e-05,
5226
+ "loss": 2.2168,
5227
+ "step": 742
5228
+ },
5229
+ {
5230
+ "epoch": 0.6762999203549892,
5231
+ "grad_norm": 0.5417767763137817,
5232
+ "learning_rate": 4.826147098197691e-05,
5233
+ "loss": 2.2379,
5234
+ "step": 743
5235
+ },
5236
+ {
5237
+ "epoch": 0.6772101490499488,
5238
+ "grad_norm": 0.5735164284706116,
5239
+ "learning_rate": 4.8014815543057475e-05,
5240
+ "loss": 2.2132,
5241
+ "step": 744
5242
+ },
5243
+ {
5244
+ "epoch": 0.6781203777449084,
5245
+ "grad_norm": 0.5819071531295776,
5246
+ "learning_rate": 4.776859274075506e-05,
5247
+ "loss": 2.2469,
5248
+ "step": 745
5249
+ },
5250
+ {
5251
+ "epoch": 0.679030606439868,
5252
+ "grad_norm": 0.6113678216934204,
5253
+ "learning_rate": 4.752280462421117e-05,
5254
+ "loss": 2.3064,
5255
+ "step": 746
5256
+ },
5257
+ {
5258
+ "epoch": 0.6799408351348276,
5259
+ "grad_norm": 0.6506679654121399,
5260
+ "learning_rate": 4.727745323894976e-05,
5261
+ "loss": 2.3311,
5262
+ "step": 747
5263
+ },
5264
+ {
5265
+ "epoch": 0.6808510638297872,
5266
+ "grad_norm": 0.7372251152992249,
5267
+ "learning_rate": 4.703254062686017e-05,
5268
+ "loss": 2.575,
5269
+ "step": 748
5270
+ },
5271
+ {
5272
+ "epoch": 0.6817612925247468,
5273
+ "grad_norm": 0.8235337734222412,
5274
+ "learning_rate": 4.678806882618003e-05,
5275
+ "loss": 2.4711,
5276
+ "step": 749
5277
+ },
5278
+ {
5279
+ "epoch": 0.6826715212197064,
5280
+ "grad_norm": 1.295682430267334,
5281
+ "learning_rate": 4.654403987147865e-05,
5282
+ "loss": 2.5713,
5283
+ "step": 750
5284
+ },
5285
+ {
5286
+ "epoch": 0.683581749914666,
5287
+ "grad_norm": 0.46585023403167725,
5288
+ "learning_rate": 4.630045579363957e-05,
5289
+ "loss": 2.4203,
5290
+ "step": 751
5291
+ },
5292
+ {
5293
+ "epoch": 0.6844919786096256,
5294
+ "grad_norm": 0.4163340628147125,
5295
+ "learning_rate": 4.605731861984401e-05,
5296
+ "loss": 2.261,
5297
+ "step": 752
5298
+ },
5299
+ {
5300
+ "epoch": 0.6854022073045852,
5301
+ "grad_norm": 0.4268229007720947,
5302
+ "learning_rate": 4.5814630373554115e-05,
5303
+ "loss": 2.3078,
5304
+ "step": 753
5305
+ },
5306
+ {
5307
+ "epoch": 0.6863124359995448,
5308
+ "grad_norm": 0.43664565682411194,
5309
+ "learning_rate": 4.557239307449561e-05,
5310
+ "loss": 2.5044,
5311
+ "step": 754
5312
+ },
5313
+ {
5314
+ "epoch": 0.6872226646945045,
5315
+ "grad_norm": 0.44144201278686523,
5316
+ "learning_rate": 4.5330608738641486e-05,
5317
+ "loss": 2.4192,
5318
+ "step": 755
5319
+ },
5320
+ {
5321
+ "epoch": 0.6881328933894642,
5322
+ "grad_norm": 0.40867024660110474,
5323
+ "learning_rate": 4.508927937819499e-05,
5324
+ "loss": 2.1908,
5325
+ "step": 756
5326
+ },
5327
+ {
5328
+ "epoch": 0.6890431220844238,
5329
+ "grad_norm": 0.4147084057331085,
5330
+ "learning_rate": 4.484840700157295e-05,
5331
+ "loss": 2.2864,
5332
+ "step": 757
5333
+ },
5334
+ {
5335
+ "epoch": 0.6899533507793834,
5336
+ "grad_norm": 0.413703054189682,
5337
+ "learning_rate": 4.4607993613388976e-05,
5338
+ "loss": 2.2436,
5339
+ "step": 758
5340
+ },
5341
+ {
5342
+ "epoch": 0.690863579474343,
5343
+ "grad_norm": 0.42309460043907166,
5344
+ "learning_rate": 4.436804121443689e-05,
5345
+ "loss": 2.3444,
5346
+ "step": 759
5347
+ },
5348
+ {
5349
+ "epoch": 0.6917738081693026,
5350
+ "grad_norm": 0.4229698181152344,
5351
+ "learning_rate": 4.412855180167406e-05,
5352
+ "loss": 2.3264,
5353
+ "step": 760
5354
+ },
5355
+ {
5356
+ "epoch": 0.6926840368642622,
5357
+ "grad_norm": 0.4247763156890869,
5358
+ "learning_rate": 4.388952736820453e-05,
5359
+ "loss": 2.257,
5360
+ "step": 761
5361
+ },
5362
+ {
5363
+ "epoch": 0.6935942655592218,
5364
+ "grad_norm": 0.42383337020874023,
5365
+ "learning_rate": 4.365096990326297e-05,
5366
+ "loss": 2.1349,
5367
+ "step": 762
5368
+ },
5369
+ {
5370
+ "epoch": 0.6945044942541814,
5371
+ "grad_norm": 0.4181368350982666,
5372
+ "learning_rate": 4.3412881392197526e-05,
5373
+ "loss": 2.2587,
5374
+ "step": 763
5375
+ },
5376
+ {
5377
+ "epoch": 0.695414722949141,
5378
+ "grad_norm": 0.4086921811103821,
5379
+ "learning_rate": 4.317526381645363e-05,
5380
+ "loss": 2.2378,
5381
+ "step": 764
5382
+ },
5383
+ {
5384
+ "epoch": 0.6963249516441006,
5385
+ "grad_norm": 0.4001654088497162,
5386
+ "learning_rate": 4.293811915355761e-05,
5387
+ "loss": 2.1708,
5388
+ "step": 765
5389
+ },
5390
+ {
5391
+ "epoch": 0.6972351803390602,
5392
+ "grad_norm": 0.42960214614868164,
5393
+ "learning_rate": 4.270144937709981e-05,
5394
+ "loss": 2.1556,
5395
+ "step": 766
5396
+ },
5397
+ {
5398
+ "epoch": 0.6981454090340198,
5399
+ "grad_norm": 0.42000848054885864,
5400
+ "learning_rate": 4.2465256456718615e-05,
5401
+ "loss": 2.1182,
5402
+ "step": 767
5403
+ },
5404
+ {
5405
+ "epoch": 0.6990556377289794,
5406
+ "grad_norm": 0.394368052482605,
5407
+ "learning_rate": 4.222954235808378e-05,
5408
+ "loss": 2.0486,
5409
+ "step": 768
5410
+ },
5411
+ {
5412
+ "epoch": 0.699965866423939,
5413
+ "grad_norm": 0.3934192359447479,
5414
+ "learning_rate": 4.19943090428802e-05,
5415
+ "loss": 1.9222,
5416
+ "step": 769
5417
+ },
5418
+ {
5419
+ "epoch": 0.7008760951188986,
5420
+ "grad_norm": 0.44399651885032654,
5421
+ "learning_rate": 4.175955846879151e-05,
5422
+ "loss": 2.1621,
5423
+ "step": 770
5424
+ },
5425
+ {
5426
+ "epoch": 0.7017863238138582,
5427
+ "grad_norm": 0.4081316888332367,
5428
+ "learning_rate": 4.1525292589483843e-05,
5429
+ "loss": 1.9534,
5430
+ "step": 771
5431
+ },
5432
+ {
5433
+ "epoch": 0.7026965525088178,
5434
+ "grad_norm": 0.42008188366889954,
5435
+ "learning_rate": 4.129151335458957e-05,
5436
+ "loss": 1.9773,
5437
+ "step": 772
5438
+ },
5439
+ {
5440
+ "epoch": 0.7036067812037774,
5441
+ "grad_norm": 0.4209793508052826,
5442
+ "learning_rate": 4.105822270969102e-05,
5443
+ "loss": 2.0403,
5444
+ "step": 773
5445
+ },
5446
+ {
5447
+ "epoch": 0.7045170098987371,
5448
+ "grad_norm": 0.43969592452049255,
5449
+ "learning_rate": 4.0825422596304396e-05,
5450
+ "loss": 2.1796,
5451
+ "step": 774
5452
+ },
5453
+ {
5454
+ "epoch": 0.7054272385936967,
5455
+ "grad_norm": 0.4333605468273163,
5456
+ "learning_rate": 4.059311495186338e-05,
5457
+ "loss": 2.1115,
5458
+ "step": 775
5459
+ },
5460
+ {
5461
+ "epoch": 0.7063374672886563,
5462
+ "grad_norm": 0.42669251561164856,
5463
+ "learning_rate": 4.036130170970341e-05,
5464
+ "loss": 2.1563,
5465
+ "step": 776
5466
+ },
5467
+ {
5468
+ "epoch": 0.7072476959836159,
5469
+ "grad_norm": 0.45064857602119446,
5470
+ "learning_rate": 4.012998479904525e-05,
5471
+ "loss": 2.2003,
5472
+ "step": 777
5473
+ },
5474
+ {
5475
+ "epoch": 0.7081579246785755,
5476
+ "grad_norm": 0.4487575888633728,
5477
+ "learning_rate": 3.9899166144978904e-05,
5478
+ "loss": 2.1332,
5479
+ "step": 778
5480
+ },
5481
+ {
5482
+ "epoch": 0.7090681533735351,
5483
+ "grad_norm": 0.4423072636127472,
5484
+ "learning_rate": 3.966884766844803e-05,
5485
+ "loss": 2.1449,
5486
+ "step": 779
5487
+ },
5488
+ {
5489
+ "epoch": 0.7099783820684947,
5490
+ "grad_norm": 0.4436761736869812,
5491
+ "learning_rate": 3.943903128623335e-05,
5492
+ "loss": 2.0025,
5493
+ "step": 780
5494
+ },
5495
+ {
5496
+ "epoch": 0.7108886107634543,
5497
+ "grad_norm": 0.4364500045776367,
5498
+ "learning_rate": 3.920971891093718e-05,
5499
+ "loss": 1.9834,
5500
+ "step": 781
5501
+ },
5502
+ {
5503
+ "epoch": 0.7117988394584139,
5504
+ "grad_norm": 0.44009071588516235,
5505
+ "learning_rate": 3.8980912450967366e-05,
5506
+ "loss": 2.0204,
5507
+ "step": 782
5508
+ },
5509
+ {
5510
+ "epoch": 0.7127090681533735,
5511
+ "grad_norm": 0.4498206079006195,
5512
+ "learning_rate": 3.875261381052121e-05,
5513
+ "loss": 2.0348,
5514
+ "step": 783
5515
+ },
5516
+ {
5517
+ "epoch": 0.7136192968483331,
5518
+ "grad_norm": 0.48029908537864685,
5519
+ "learning_rate": 3.852482488956992e-05,
5520
+ "loss": 2.0383,
5521
+ "step": 784
5522
+ },
5523
+ {
5524
+ "epoch": 0.7145295255432927,
5525
+ "grad_norm": 0.4986077845096588,
5526
+ "learning_rate": 3.829754758384262e-05,
5527
+ "loss": 2.3006,
5528
+ "step": 785
5529
+ },
5530
+ {
5531
+ "epoch": 0.7154397542382523,
5532
+ "grad_norm": 0.5001522302627563,
5533
+ "learning_rate": 3.807078378481059e-05,
5534
+ "loss": 2.3416,
5535
+ "step": 786
5536
+ },
5537
+ {
5538
+ "epoch": 0.716349982933212,
5539
+ "grad_norm": 0.5049505829811096,
5540
+ "learning_rate": 3.784453537967161e-05,
5541
+ "loss": 2.1652,
5542
+ "step": 787
5543
+ },
5544
+ {
5545
+ "epoch": 0.7172602116281716,
5546
+ "grad_norm": 0.5048404932022095,
5547
+ "learning_rate": 3.761880425133413e-05,
5548
+ "loss": 2.1345,
5549
+ "step": 788
5550
+ },
5551
+ {
5552
+ "epoch": 0.7181704403231312,
5553
+ "grad_norm": 0.4869529604911804,
5554
+ "learning_rate": 3.7393592278401704e-05,
5555
+ "loss": 2.0906,
5556
+ "step": 789
5557
+ },
5558
+ {
5559
+ "epoch": 0.7190806690180908,
5560
+ "grad_norm": 0.5454188585281372,
5561
+ "learning_rate": 3.7168901335157315e-05,
5562
+ "loss": 2.4214,
5563
+ "step": 790
5564
+ },
5565
+ {
5566
+ "epoch": 0.7199908977130504,
5567
+ "grad_norm": 0.5238876938819885,
5568
+ "learning_rate": 3.694473329154778e-05,
5569
+ "loss": 1.9798,
5570
+ "step": 791
5571
+ },
5572
+ {
5573
+ "epoch": 0.72090112640801,
5574
+ "grad_norm": 0.5545910596847534,
5575
+ "learning_rate": 3.672109001316809e-05,
5576
+ "loss": 2.4726,
5577
+ "step": 792
5578
+ },
5579
+ {
5580
+ "epoch": 0.7218113551029697,
5581
+ "grad_norm": 0.542072594165802,
5582
+ "learning_rate": 3.649797336124615e-05,
5583
+ "loss": 2.016,
5584
+ "step": 793
5585
+ },
5586
+ {
5587
+ "epoch": 0.7227215837979293,
5588
+ "grad_norm": 0.5355279445648193,
5589
+ "learning_rate": 3.6275385192627056e-05,
5590
+ "loss": 2.1041,
5591
+ "step": 794
5592
+ },
5593
+ {
5594
+ "epoch": 0.7236318124928889,
5595
+ "grad_norm": 0.5673330426216125,
5596
+ "learning_rate": 3.6053327359757535e-05,
5597
+ "loss": 2.1006,
5598
+ "step": 795
5599
+ },
5600
+ {
5601
+ "epoch": 0.7245420411878485,
5602
+ "grad_norm": 0.6170483231544495,
5603
+ "learning_rate": 3.583180171067101e-05,
5604
+ "loss": 2.3275,
5605
+ "step": 796
5606
+ },
5607
+ {
5608
+ "epoch": 0.7254522698828081,
5609
+ "grad_norm": 0.6877503991127014,
5610
+ "learning_rate": 3.5610810088971625e-05,
5611
+ "loss": 2.4504,
5612
+ "step": 797
5613
+ },
5614
+ {
5615
+ "epoch": 0.7263624985777677,
5616
+ "grad_norm": 0.7676892280578613,
5617
+ "learning_rate": 3.5390354333819344e-05,
5618
+ "loss": 2.6627,
5619
+ "step": 798
5620
+ },
5621
+ {
5622
+ "epoch": 0.7272727272727273,
5623
+ "grad_norm": 0.9272985458374023,
5624
+ "learning_rate": 3.517043627991441e-05,
5625
+ "loss": 2.5253,
5626
+ "step": 799
5627
+ },
5628
+ {
5629
+ "epoch": 0.7281829559676869,
5630
+ "grad_norm": 1.9746160507202148,
5631
+ "learning_rate": 3.4951057757482205e-05,
5632
+ "loss": 2.5993,
5633
+ "step": 800
5634
+ },
5635
+ {
5636
+ "epoch": 0.7290931846626465,
5637
+ "grad_norm": 0.4283389449119568,
5638
+ "learning_rate": 3.4732220592257946e-05,
5639
+ "loss": 2.5104,
5640
+ "step": 801
5641
+ },
5642
+ {
5643
+ "epoch": 0.7300034133576061,
5644
+ "grad_norm": 0.43243661522865295,
5645
+ "learning_rate": 3.45139266054715e-05,
5646
+ "loss": 2.1707,
5647
+ "step": 802
5648
+ },
5649
+ {
5650
+ "epoch": 0.7309136420525657,
5651
+ "grad_norm": 0.4262010455131531,
5652
+ "learning_rate": 3.429617761383222e-05,
5653
+ "loss": 2.2513,
5654
+ "step": 803
5655
+ },
5656
+ {
5657
+ "epoch": 0.7318238707475253,
5658
+ "grad_norm": 0.4308786392211914,
5659
+ "learning_rate": 3.40789754295139e-05,
5660
+ "loss": 2.4042,
5661
+ "step": 804
5662
+ },
5663
+ {
5664
+ "epoch": 0.7327340994424849,
5665
+ "grad_norm": 0.42787450551986694,
5666
+ "learning_rate": 3.3862321860139576e-05,
5667
+ "loss": 2.4259,
5668
+ "step": 805
5669
+ },
5670
+ {
5671
+ "epoch": 0.7336443281374445,
5672
+ "grad_norm": 0.4381856620311737,
5673
+ "learning_rate": 3.364621870876659e-05,
5674
+ "loss": 2.4046,
5675
+ "step": 806
5676
+ },
5677
+ {
5678
+ "epoch": 0.7345545568324041,
5679
+ "grad_norm": 0.4300837814807892,
5680
+ "learning_rate": 3.343066777387148e-05,
5681
+ "loss": 2.3713,
5682
+ "step": 807
5683
+ },
5684
+ {
5685
+ "epoch": 0.7354647855273637,
5686
+ "grad_norm": 0.4228179156780243,
5687
+ "learning_rate": 3.3215670849335155e-05,
5688
+ "loss": 2.2606,
5689
+ "step": 808
5690
+ },
5691
+ {
5692
+ "epoch": 0.7363750142223233,
5693
+ "grad_norm": 0.4267946779727936,
5694
+ "learning_rate": 3.300122972442773e-05,
5695
+ "loss": 2.3377,
5696
+ "step": 809
5697
+ },
5698
+ {
5699
+ "epoch": 0.7372852429172829,
5700
+ "grad_norm": 0.41019585728645325,
5701
+ "learning_rate": 3.278734618379402e-05,
5702
+ "loss": 2.1903,
5703
+ "step": 810
5704
+ },
5705
+ {
5706
+ "epoch": 0.7381954716122425,
5707
+ "grad_norm": 0.42949211597442627,
5708
+ "learning_rate": 3.257402200743821e-05,
5709
+ "loss": 2.3309,
5710
+ "step": 811
5711
+ },
5712
+ {
5713
+ "epoch": 0.7391057003072021,
5714
+ "grad_norm": 0.4410031735897064,
5715
+ "learning_rate": 3.2361258970709397e-05,
5716
+ "loss": 2.3924,
5717
+ "step": 812
5718
+ },
5719
+ {
5720
+ "epoch": 0.7400159290021618,
5721
+ "grad_norm": 0.4223659336566925,
5722
+ "learning_rate": 3.21490588442868e-05,
5723
+ "loss": 2.2614,
5724
+ "step": 813
5725
+ },
5726
+ {
5727
+ "epoch": 0.7409261576971214,
5728
+ "grad_norm": 0.43348926305770874,
5729
+ "learning_rate": 3.19374233941647e-05,
5730
+ "loss": 2.4255,
5731
+ "step": 814
5732
+ },
5733
+ {
5734
+ "epoch": 0.741836386392081,
5735
+ "grad_norm": 0.42184650897979736,
5736
+ "learning_rate": 3.172635438163816e-05,
5737
+ "loss": 2.2794,
5738
+ "step": 815
5739
+ },
5740
+ {
5741
+ "epoch": 0.7427466150870407,
5742
+ "grad_norm": 0.4127393066883087,
5743
+ "learning_rate": 3.1515853563288076e-05,
5744
+ "loss": 2.1242,
5745
+ "step": 816
5746
+ },
5747
+ {
5748
+ "epoch": 0.7436568437820003,
5749
+ "grad_norm": 0.40478646755218506,
5750
+ "learning_rate": 3.130592269096671e-05,
5751
+ "loss": 2.035,
5752
+ "step": 817
5753
+ },
5754
+ {
5755
+ "epoch": 0.7445670724769599,
5756
+ "grad_norm": 0.41883495450019836,
5757
+ "learning_rate": 3.1096563511783014e-05,
5758
+ "loss": 2.1427,
5759
+ "step": 818
5760
+ },
5761
+ {
5762
+ "epoch": 0.7454773011719195,
5763
+ "grad_norm": 0.39868757128715515,
5764
+ "learning_rate": 3.08877777680882e-05,
5765
+ "loss": 2.0169,
5766
+ "step": 819
5767
+ },
5768
+ {
5769
+ "epoch": 0.7463875298668791,
5770
+ "grad_norm": 0.392415851354599,
5771
+ "learning_rate": 3.0679567197461134e-05,
5772
+ "loss": 2.0969,
5773
+ "step": 820
5774
+ },
5775
+ {
5776
+ "epoch": 0.7472977585618387,
5777
+ "grad_norm": 0.4181436598300934,
5778
+ "learning_rate": 3.047193353269382e-05,
5779
+ "loss": 2.1766,
5780
+ "step": 821
5781
+ },
5782
+ {
5783
+ "epoch": 0.7482079872567983,
5784
+ "grad_norm": 0.41824692487716675,
5785
+ "learning_rate": 3.0264878501777306e-05,
5786
+ "loss": 2.0897,
5787
+ "step": 822
5788
+ },
5789
+ {
5790
+ "epoch": 0.7491182159517579,
5791
+ "grad_norm": 0.442640095949173,
5792
+ "learning_rate": 3.005840382788685e-05,
5793
+ "loss": 2.0851,
5794
+ "step": 823
5795
+ },
5796
+ {
5797
+ "epoch": 0.7500284446467175,
5798
+ "grad_norm": 0.43662169575691223,
5799
+ "learning_rate": 2.9852511229367865e-05,
5800
+ "loss": 2.1546,
5801
+ "step": 824
5802
+ },
5803
+ {
5804
+ "epoch": 0.7509386733416771,
5805
+ "grad_norm": 0.44712746143341064,
5806
+ "learning_rate": 2.9647202419721687e-05,
5807
+ "loss": 2.2304,
5808
+ "step": 825
5809
+ },
5810
+ {
5811
+ "epoch": 0.7509386733416771,
5812
+ "eval_loss": 2.2184386253356934,
5813
+ "eval_runtime": 205.4094,
5814
+ "eval_samples_per_second": 9.011,
5815
+ "eval_steps_per_second": 4.508,
5816
+ "step": 825
5817
  }
5818
  ],
5819
  "logging_steps": 1,
 
5833
  "attributes": {}
5834
  }
5835
  },
5836
+ "total_flos": 1.5204499502137344e+18,
5837
  "train_batch_size": 2,
5838
  "trial_name": null,
5839
  "trial_params": null