{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.0,
"eval_steps": 500,
"global_step": 999,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009009009009009009,
"grad_norm": 1.7692377372839305,
"learning_rate": 5e-06,
"loss": 0.3442,
"step": 1
},
{
"epoch": 0.018018018018018018,
"grad_norm": 1.655774168281545,
"learning_rate": 5e-06,
"loss": 0.374,
"step": 2
},
{
"epoch": 0.02702702702702703,
"grad_norm": 1.5838611317965265,
"learning_rate": 5e-06,
"loss": 0.3763,
"step": 3
},
{
"epoch": 0.036036036036036036,
"grad_norm": 1.4722490643600856,
"learning_rate": 5e-06,
"loss": 0.3667,
"step": 4
},
{
"epoch": 0.04504504504504504,
"grad_norm": 0.967982129724269,
"learning_rate": 5e-06,
"loss": 0.3378,
"step": 5
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.9655512366546067,
"learning_rate": 5e-06,
"loss": 0.325,
"step": 6
},
{
"epoch": 0.06306306306306306,
"grad_norm": 0.7980444967597017,
"learning_rate": 5e-06,
"loss": 0.3418,
"step": 7
},
{
"epoch": 0.07207207207207207,
"grad_norm": 0.5186921114651042,
"learning_rate": 5e-06,
"loss": 0.3508,
"step": 8
},
{
"epoch": 0.08108108108108109,
"grad_norm": 0.5518215648538942,
"learning_rate": 5e-06,
"loss": 0.348,
"step": 9
},
{
"epoch": 0.09009009009009009,
"grad_norm": 0.6622509236535837,
"learning_rate": 5e-06,
"loss": 0.3539,
"step": 10
},
{
"epoch": 0.0990990990990991,
"grad_norm": 0.6537261351376887,
"learning_rate": 5e-06,
"loss": 0.3364,
"step": 11
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.6557224204301801,
"learning_rate": 5e-06,
"loss": 0.3529,
"step": 12
},
{
"epoch": 0.11711711711711711,
"grad_norm": 0.6677743317643713,
"learning_rate": 5e-06,
"loss": 0.3233,
"step": 13
},
{
"epoch": 0.12612612612612611,
"grad_norm": 0.5771734482767436,
"learning_rate": 5e-06,
"loss": 0.3433,
"step": 14
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.5194262746227281,
"learning_rate": 5e-06,
"loss": 0.2968,
"step": 15
},
{
"epoch": 0.14414414414414414,
"grad_norm": 0.5871866323370637,
"learning_rate": 5e-06,
"loss": 0.3177,
"step": 16
},
{
"epoch": 0.15315315315315314,
"grad_norm": 0.6823752349157315,
"learning_rate": 5e-06,
"loss": 0.3217,
"step": 17
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.5298328303770766,
"learning_rate": 5e-06,
"loss": 0.333,
"step": 18
},
{
"epoch": 0.17117117117117117,
"grad_norm": 0.45672917289622006,
"learning_rate": 5e-06,
"loss": 0.2998,
"step": 19
},
{
"epoch": 0.18018018018018017,
"grad_norm": 0.4892453526407057,
"learning_rate": 5e-06,
"loss": 0.3301,
"step": 20
},
{
"epoch": 0.1891891891891892,
"grad_norm": 0.4157035225188495,
"learning_rate": 5e-06,
"loss": 0.3097,
"step": 21
},
{
"epoch": 0.1981981981981982,
"grad_norm": 0.42144355038756004,
"learning_rate": 5e-06,
"loss": 0.336,
"step": 22
},
{
"epoch": 0.2072072072072072,
"grad_norm": 0.40449172267977285,
"learning_rate": 5e-06,
"loss": 0.3203,
"step": 23
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.3817504264369776,
"learning_rate": 5e-06,
"loss": 0.3282,
"step": 24
},
{
"epoch": 0.22522522522522523,
"grad_norm": 0.37458931065383283,
"learning_rate": 5e-06,
"loss": 0.3427,
"step": 25
},
{
"epoch": 0.23423423423423423,
"grad_norm": 0.43415654347436194,
"learning_rate": 5e-06,
"loss": 0.3361,
"step": 26
},
{
"epoch": 0.24324324324324326,
"grad_norm": 0.34734907350951355,
"learning_rate": 5e-06,
"loss": 0.3081,
"step": 27
},
{
"epoch": 0.25225225225225223,
"grad_norm": 0.3446691978222806,
"learning_rate": 5e-06,
"loss": 0.3104,
"step": 28
},
{
"epoch": 0.26126126126126126,
"grad_norm": 0.3219457244434707,
"learning_rate": 5e-06,
"loss": 0.3154,
"step": 29
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.35333024684448033,
"learning_rate": 5e-06,
"loss": 0.3238,
"step": 30
},
{
"epoch": 0.27927927927927926,
"grad_norm": 0.38018940900412435,
"learning_rate": 5e-06,
"loss": 0.3479,
"step": 31
},
{
"epoch": 0.2882882882882883,
"grad_norm": 0.4160537077429581,
"learning_rate": 5e-06,
"loss": 0.3225,
"step": 32
},
{
"epoch": 0.2972972972972973,
"grad_norm": 0.4001899610048794,
"learning_rate": 5e-06,
"loss": 0.3378,
"step": 33
},
{
"epoch": 0.3063063063063063,
"grad_norm": 0.3966450451230361,
"learning_rate": 5e-06,
"loss": 0.3136,
"step": 34
},
{
"epoch": 0.3153153153153153,
"grad_norm": 0.35442342787868963,
"learning_rate": 5e-06,
"loss": 0.3272,
"step": 35
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.31417075347024526,
"learning_rate": 5e-06,
"loss": 0.34,
"step": 36
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.31460209634883374,
"learning_rate": 5e-06,
"loss": 0.3068,
"step": 37
},
{
"epoch": 0.34234234234234234,
"grad_norm": 0.27638346613404846,
"learning_rate": 5e-06,
"loss": 0.3355,
"step": 38
},
{
"epoch": 0.35135135135135137,
"grad_norm": 0.31966876717000925,
"learning_rate": 5e-06,
"loss": 0.3176,
"step": 39
},
{
"epoch": 0.36036036036036034,
"grad_norm": 0.2841478766107157,
"learning_rate": 5e-06,
"loss": 0.3031,
"step": 40
},
{
"epoch": 0.36936936936936937,
"grad_norm": 0.30103555060540843,
"learning_rate": 5e-06,
"loss": 0.3149,
"step": 41
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.3245469606074819,
"learning_rate": 5e-06,
"loss": 0.3147,
"step": 42
},
{
"epoch": 0.38738738738738737,
"grad_norm": 0.32434042643762057,
"learning_rate": 5e-06,
"loss": 0.3305,
"step": 43
},
{
"epoch": 0.3963963963963964,
"grad_norm": 0.27778799916309627,
"learning_rate": 5e-06,
"loss": 0.3384,
"step": 44
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.2745056010877783,
"learning_rate": 5e-06,
"loss": 0.3156,
"step": 45
},
{
"epoch": 0.4144144144144144,
"grad_norm": 0.29410832050755714,
"learning_rate": 5e-06,
"loss": 0.3112,
"step": 46
},
{
"epoch": 0.42342342342342343,
"grad_norm": 0.39421096404575884,
"learning_rate": 5e-06,
"loss": 0.3504,
"step": 47
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.30297987367745016,
"learning_rate": 5e-06,
"loss": 0.3133,
"step": 48
},
{
"epoch": 0.44144144144144143,
"grad_norm": 0.312599049596589,
"learning_rate": 5e-06,
"loss": 0.3298,
"step": 49
},
{
"epoch": 0.45045045045045046,
"grad_norm": 0.28890139188869196,
"learning_rate": 5e-06,
"loss": 0.3016,
"step": 50
},
{
"epoch": 0.4594594594594595,
"grad_norm": 0.27234641580243496,
"learning_rate": 5e-06,
"loss": 0.3349,
"step": 51
},
{
"epoch": 0.46846846846846846,
"grad_norm": 0.30882782510454476,
"learning_rate": 5e-06,
"loss": 0.3307,
"step": 52
},
{
"epoch": 0.4774774774774775,
"grad_norm": 0.2657310651267706,
"learning_rate": 5e-06,
"loss": 0.3246,
"step": 53
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.2876695765716273,
"learning_rate": 5e-06,
"loss": 0.336,
"step": 54
},
{
"epoch": 0.4954954954954955,
"grad_norm": 0.29656571676225046,
"learning_rate": 5e-06,
"loss": 0.3428,
"step": 55
},
{
"epoch": 0.5045045045045045,
"grad_norm": 0.25789947550982967,
"learning_rate": 5e-06,
"loss": 0.3035,
"step": 56
},
{
"epoch": 0.5135135135135135,
"grad_norm": 0.3359664317488606,
"learning_rate": 5e-06,
"loss": 0.3221,
"step": 57
},
{
"epoch": 0.5225225225225225,
"grad_norm": 0.26901646941539337,
"learning_rate": 5e-06,
"loss": 0.3061,
"step": 58
},
{
"epoch": 0.5315315315315315,
"grad_norm": 0.26500112714488566,
"learning_rate": 5e-06,
"loss": 0.3203,
"step": 59
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.2614586643859284,
"learning_rate": 5e-06,
"loss": 0.3068,
"step": 60
},
{
"epoch": 0.5495495495495496,
"grad_norm": 0.28994271054547277,
"learning_rate": 5e-06,
"loss": 0.3568,
"step": 61
},
{
"epoch": 0.5585585585585585,
"grad_norm": 0.3372155822417667,
"learning_rate": 5e-06,
"loss": 0.3559,
"step": 62
},
{
"epoch": 0.5675675675675675,
"grad_norm": 0.30224128387648297,
"learning_rate": 5e-06,
"loss": 0.3069,
"step": 63
},
{
"epoch": 0.5765765765765766,
"grad_norm": 0.3130672270163632,
"learning_rate": 5e-06,
"loss": 0.3513,
"step": 64
},
{
"epoch": 0.5855855855855856,
"grad_norm": 0.3065414445284105,
"learning_rate": 5e-06,
"loss": 0.3194,
"step": 65
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.29075353592758474,
"learning_rate": 5e-06,
"loss": 0.364,
"step": 66
},
{
"epoch": 0.6036036036036037,
"grad_norm": 0.28085597006714386,
"learning_rate": 5e-06,
"loss": 0.3626,
"step": 67
},
{
"epoch": 0.6126126126126126,
"grad_norm": 0.30828909246343983,
"learning_rate": 5e-06,
"loss": 0.3278,
"step": 68
},
{
"epoch": 0.6216216216216216,
"grad_norm": 0.30901462421223835,
"learning_rate": 5e-06,
"loss": 0.3338,
"step": 69
},
{
"epoch": 0.6306306306306306,
"grad_norm": 0.3316361212286006,
"learning_rate": 5e-06,
"loss": 0.3444,
"step": 70
},
{
"epoch": 0.6396396396396397,
"grad_norm": 0.26217545165384337,
"learning_rate": 5e-06,
"loss": 0.3226,
"step": 71
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.2563886493400457,
"learning_rate": 5e-06,
"loss": 0.3409,
"step": 72
},
{
"epoch": 0.6576576576576577,
"grad_norm": 0.2962337946705661,
"learning_rate": 5e-06,
"loss": 0.3196,
"step": 73
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2542256020612804,
"learning_rate": 5e-06,
"loss": 0.3281,
"step": 74
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.32938420923589096,
"learning_rate": 5e-06,
"loss": 0.3185,
"step": 75
},
{
"epoch": 0.6846846846846847,
"grad_norm": 0.33155654515742616,
"learning_rate": 5e-06,
"loss": 0.3153,
"step": 76
},
{
"epoch": 0.6936936936936937,
"grad_norm": 0.25618184255532905,
"learning_rate": 5e-06,
"loss": 0.3109,
"step": 77
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.27091352477500336,
"learning_rate": 5e-06,
"loss": 0.3371,
"step": 78
},
{
"epoch": 0.7117117117117117,
"grad_norm": 0.270383658268609,
"learning_rate": 5e-06,
"loss": 0.3325,
"step": 79
},
{
"epoch": 0.7207207207207207,
"grad_norm": 0.2525642887483178,
"learning_rate": 5e-06,
"loss": 0.3288,
"step": 80
},
{
"epoch": 0.7297297297297297,
"grad_norm": 0.3027811916633369,
"learning_rate": 5e-06,
"loss": 0.3472,
"step": 81
},
{
"epoch": 0.7387387387387387,
"grad_norm": 0.3506129591935139,
"learning_rate": 5e-06,
"loss": 0.3383,
"step": 82
},
{
"epoch": 0.7477477477477478,
"grad_norm": 0.32710280320818547,
"learning_rate": 5e-06,
"loss": 0.3095,
"step": 83
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.28423266582526613,
"learning_rate": 5e-06,
"loss": 0.2909,
"step": 84
},
{
"epoch": 0.7657657657657657,
"grad_norm": 0.30514786428468144,
"learning_rate": 5e-06,
"loss": 0.3181,
"step": 85
},
{
"epoch": 0.7747747747747747,
"grad_norm": 0.3219013280475637,
"learning_rate": 5e-06,
"loss": 0.3421,
"step": 86
},
{
"epoch": 0.7837837837837838,
"grad_norm": 0.30765019613171724,
"learning_rate": 5e-06,
"loss": 0.339,
"step": 87
},
{
"epoch": 0.7927927927927928,
"grad_norm": 0.31363666903509363,
"learning_rate": 5e-06,
"loss": 0.32,
"step": 88
},
{
"epoch": 0.8018018018018018,
"grad_norm": 0.2802553985535834,
"learning_rate": 5e-06,
"loss": 0.3698,
"step": 89
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.29928509597489333,
"learning_rate": 5e-06,
"loss": 0.3465,
"step": 90
},
{
"epoch": 0.8198198198198198,
"grad_norm": 0.30368274583450106,
"learning_rate": 5e-06,
"loss": 0.321,
"step": 91
},
{
"epoch": 0.8288288288288288,
"grad_norm": 0.28901190479096217,
"learning_rate": 5e-06,
"loss": 0.3034,
"step": 92
},
{
"epoch": 0.8378378378378378,
"grad_norm": 0.27035852334114224,
"learning_rate": 5e-06,
"loss": 0.3379,
"step": 93
},
{
"epoch": 0.8468468468468469,
"grad_norm": 0.2757989755002078,
"learning_rate": 5e-06,
"loss": 0.335,
"step": 94
},
{
"epoch": 0.8558558558558559,
"grad_norm": 0.30063030136785046,
"learning_rate": 5e-06,
"loss": 0.3299,
"step": 95
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.3436429105109027,
"learning_rate": 5e-06,
"loss": 0.2906,
"step": 96
},
{
"epoch": 0.8738738738738738,
"grad_norm": 0.2995609579715489,
"learning_rate": 5e-06,
"loss": 0.317,
"step": 97
},
{
"epoch": 0.8828828828828829,
"grad_norm": 0.2860865100969785,
"learning_rate": 5e-06,
"loss": 0.3281,
"step": 98
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.29202094172851817,
"learning_rate": 5e-06,
"loss": 0.3225,
"step": 99
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.2931365896073913,
"learning_rate": 5e-06,
"loss": 0.3022,
"step": 100
},
{
"epoch": 0.9099099099099099,
"grad_norm": 0.30610410355543166,
"learning_rate": 5e-06,
"loss": 0.3287,
"step": 101
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.27050744175601266,
"learning_rate": 5e-06,
"loss": 0.3004,
"step": 102
},
{
"epoch": 0.9279279279279279,
"grad_norm": 0.2530762314307683,
"learning_rate": 5e-06,
"loss": 0.3153,
"step": 103
},
{
"epoch": 0.9369369369369369,
"grad_norm": 0.2939696187606388,
"learning_rate": 5e-06,
"loss": 0.3039,
"step": 104
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.269725936200039,
"learning_rate": 5e-06,
"loss": 0.3028,
"step": 105
},
{
"epoch": 0.954954954954955,
"grad_norm": 0.32481323273559976,
"learning_rate": 5e-06,
"loss": 0.3328,
"step": 106
},
{
"epoch": 0.963963963963964,
"grad_norm": 0.3297388133110706,
"learning_rate": 5e-06,
"loss": 0.3303,
"step": 107
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.3137683488542705,
"learning_rate": 5e-06,
"loss": 0.3317,
"step": 108
},
{
"epoch": 0.9819819819819819,
"grad_norm": 0.2724212797943338,
"learning_rate": 5e-06,
"loss": 0.3231,
"step": 109
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.26974252035068974,
"learning_rate": 5e-06,
"loss": 0.3052,
"step": 110
},
{
"epoch": 1.0,
"grad_norm": 0.27546705234954955,
"learning_rate": 5e-06,
"loss": 0.2872,
"step": 111
},
{
"epoch": 1.009009009009009,
"grad_norm": 0.26532136740094475,
"learning_rate": 5e-06,
"loss": 0.2831,
"step": 112
},
{
"epoch": 1.018018018018018,
"grad_norm": 0.2847617719081207,
"learning_rate": 5e-06,
"loss": 0.3127,
"step": 113
},
{
"epoch": 1.027027027027027,
"grad_norm": 0.25187489870567525,
"learning_rate": 5e-06,
"loss": 0.3179,
"step": 114
},
{
"epoch": 1.0360360360360361,
"grad_norm": 0.2470210561590589,
"learning_rate": 5e-06,
"loss": 0.2888,
"step": 115
},
{
"epoch": 1.045045045045045,
"grad_norm": 0.2908873792372198,
"learning_rate": 5e-06,
"loss": 0.3172,
"step": 116
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.2545755890884819,
"learning_rate": 5e-06,
"loss": 0.3044,
"step": 117
},
{
"epoch": 1.063063063063063,
"grad_norm": 0.2720375854552878,
"learning_rate": 5e-06,
"loss": 0.3171,
"step": 118
},
{
"epoch": 1.072072072072072,
"grad_norm": 0.2804009954248822,
"learning_rate": 5e-06,
"loss": 0.2903,
"step": 119
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.2584639986814767,
"learning_rate": 5e-06,
"loss": 0.2786,
"step": 120
},
{
"epoch": 1.09009009009009,
"grad_norm": 0.2523704924311713,
"learning_rate": 5e-06,
"loss": 0.3009,
"step": 121
},
{
"epoch": 1.0990990990990992,
"grad_norm": 0.27166739983138516,
"learning_rate": 5e-06,
"loss": 0.3144,
"step": 122
},
{
"epoch": 1.1081081081081081,
"grad_norm": 0.291934322919287,
"learning_rate": 5e-06,
"loss": 0.3199,
"step": 123
},
{
"epoch": 1.117117117117117,
"grad_norm": 0.2869424658137007,
"learning_rate": 5e-06,
"loss": 0.2768,
"step": 124
},
{
"epoch": 1.1261261261261262,
"grad_norm": 0.35542461802439873,
"learning_rate": 5e-06,
"loss": 0.2972,
"step": 125
},
{
"epoch": 1.135135135135135,
"grad_norm": 0.25765779721736737,
"learning_rate": 5e-06,
"loss": 0.2715,
"step": 126
},
{
"epoch": 1.1441441441441442,
"grad_norm": 0.2850720419420103,
"learning_rate": 5e-06,
"loss": 0.2861,
"step": 127
},
{
"epoch": 1.1531531531531531,
"grad_norm": 0.2869267701696132,
"learning_rate": 5e-06,
"loss": 0.2797,
"step": 128
},
{
"epoch": 1.1621621621621623,
"grad_norm": 0.27437916265446266,
"learning_rate": 5e-06,
"loss": 0.284,
"step": 129
},
{
"epoch": 1.1711711711711712,
"grad_norm": 0.26640743341471523,
"learning_rate": 5e-06,
"loss": 0.282,
"step": 130
},
{
"epoch": 1.1801801801801801,
"grad_norm": 0.2600732173679119,
"learning_rate": 5e-06,
"loss": 0.2919,
"step": 131
},
{
"epoch": 1.1891891891891893,
"grad_norm": 0.2665092682109021,
"learning_rate": 5e-06,
"loss": 0.3143,
"step": 132
},
{
"epoch": 1.1981981981981982,
"grad_norm": 0.24683974895824953,
"learning_rate": 5e-06,
"loss": 0.2987,
"step": 133
},
{
"epoch": 1.2072072072072073,
"grad_norm": 0.2908036694917544,
"learning_rate": 5e-06,
"loss": 0.3158,
"step": 134
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.2945953899064198,
"learning_rate": 5e-06,
"loss": 0.3152,
"step": 135
},
{
"epoch": 1.2252252252252251,
"grad_norm": 0.2616231868963709,
"learning_rate": 5e-06,
"loss": 0.3152,
"step": 136
},
{
"epoch": 1.2342342342342343,
"grad_norm": 0.27650089751312973,
"learning_rate": 5e-06,
"loss": 0.3029,
"step": 137
},
{
"epoch": 1.2432432432432432,
"grad_norm": 0.2631481660529609,
"learning_rate": 5e-06,
"loss": 0.3084,
"step": 138
},
{
"epoch": 1.2522522522522523,
"grad_norm": 0.28830473220819297,
"learning_rate": 5e-06,
"loss": 0.3251,
"step": 139
},
{
"epoch": 1.2612612612612613,
"grad_norm": 0.3062303048487267,
"learning_rate": 5e-06,
"loss": 0.3093,
"step": 140
},
{
"epoch": 1.2702702702702702,
"grad_norm": 0.3066815320224598,
"learning_rate": 5e-06,
"loss": 0.2881,
"step": 141
},
{
"epoch": 1.2792792792792793,
"grad_norm": 0.29129920876550947,
"learning_rate": 5e-06,
"loss": 0.282,
"step": 142
},
{
"epoch": 1.2882882882882882,
"grad_norm": 0.2895564905632834,
"learning_rate": 5e-06,
"loss": 0.3076,
"step": 143
},
{
"epoch": 1.2972972972972974,
"grad_norm": 0.25687914463290057,
"learning_rate": 5e-06,
"loss": 0.283,
"step": 144
},
{
"epoch": 1.3063063063063063,
"grad_norm": 0.2543976032045274,
"learning_rate": 5e-06,
"loss": 0.2987,
"step": 145
},
{
"epoch": 1.3153153153153152,
"grad_norm": 0.27423309545031693,
"learning_rate": 5e-06,
"loss": 0.2981,
"step": 146
},
{
"epoch": 1.3243243243243243,
"grad_norm": 0.3127504643012831,
"learning_rate": 5e-06,
"loss": 0.3091,
"step": 147
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.2738777266583336,
"learning_rate": 5e-06,
"loss": 0.2577,
"step": 148
},
{
"epoch": 1.3423423423423424,
"grad_norm": 0.2669333852747903,
"learning_rate": 5e-06,
"loss": 0.2855,
"step": 149
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.26761386479699967,
"learning_rate": 5e-06,
"loss": 0.3019,
"step": 150
},
{
"epoch": 1.3603603603603602,
"grad_norm": 0.25789802423884284,
"learning_rate": 5e-06,
"loss": 0.3107,
"step": 151
},
{
"epoch": 1.3693693693693694,
"grad_norm": 0.27940713368034126,
"learning_rate": 5e-06,
"loss": 0.2843,
"step": 152
},
{
"epoch": 1.3783783783783785,
"grad_norm": 0.277366156708692,
"learning_rate": 5e-06,
"loss": 0.2799,
"step": 153
},
{
"epoch": 1.3873873873873874,
"grad_norm": 0.2607843059312788,
"learning_rate": 5e-06,
"loss": 0.292,
"step": 154
},
{
"epoch": 1.3963963963963963,
"grad_norm": 0.2649281612489507,
"learning_rate": 5e-06,
"loss": 0.3134,
"step": 155
},
{
"epoch": 1.4054054054054055,
"grad_norm": 0.27271972468771527,
"learning_rate": 5e-06,
"loss": 0.2972,
"step": 156
},
{
"epoch": 1.4144144144144144,
"grad_norm": 0.26207901754212165,
"learning_rate": 5e-06,
"loss": 0.2822,
"step": 157
},
{
"epoch": 1.4234234234234235,
"grad_norm": 0.2641717963089793,
"learning_rate": 5e-06,
"loss": 0.2971,
"step": 158
},
{
"epoch": 1.4324324324324325,
"grad_norm": 0.2579842614638958,
"learning_rate": 5e-06,
"loss": 0.3024,
"step": 159
},
{
"epoch": 1.4414414414414414,
"grad_norm": 0.2870255938899811,
"learning_rate": 5e-06,
"loss": 0.2885,
"step": 160
},
{
"epoch": 1.4504504504504505,
"grad_norm": 0.2777224839264993,
"learning_rate": 5e-06,
"loss": 0.2892,
"step": 161
},
{
"epoch": 1.4594594594594594,
"grad_norm": 0.27625106290913043,
"learning_rate": 5e-06,
"loss": 0.2805,
"step": 162
},
{
"epoch": 1.4684684684684686,
"grad_norm": 0.2700016737510603,
"learning_rate": 5e-06,
"loss": 0.2992,
"step": 163
},
{
"epoch": 1.4774774774774775,
"grad_norm": 0.25372514988722056,
"learning_rate": 5e-06,
"loss": 0.2972,
"step": 164
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.28782834487825465,
"learning_rate": 5e-06,
"loss": 0.3018,
"step": 165
},
{
"epoch": 1.4954954954954955,
"grad_norm": 0.27036226357763354,
"learning_rate": 5e-06,
"loss": 0.2968,
"step": 166
},
{
"epoch": 1.5045045045045045,
"grad_norm": 0.24997568182394178,
"learning_rate": 5e-06,
"loss": 0.2482,
"step": 167
},
{
"epoch": 1.5135135135135136,
"grad_norm": 0.28025540658752757,
"learning_rate": 5e-06,
"loss": 0.3314,
"step": 168
},
{
"epoch": 1.5225225225225225,
"grad_norm": 0.25563343479526396,
"learning_rate": 5e-06,
"loss": 0.3163,
"step": 169
},
{
"epoch": 1.5315315315315314,
"grad_norm": 0.3556162754506623,
"learning_rate": 5e-06,
"loss": 0.2925,
"step": 170
},
{
"epoch": 1.5405405405405406,
"grad_norm": 0.27599016482238853,
"learning_rate": 5e-06,
"loss": 0.2838,
"step": 171
},
{
"epoch": 1.5495495495495497,
"grad_norm": 0.272343971725021,
"learning_rate": 5e-06,
"loss": 0.3088,
"step": 172
},
{
"epoch": 1.5585585585585586,
"grad_norm": 0.28693003610171597,
"learning_rate": 5e-06,
"loss": 0.2921,
"step": 173
},
{
"epoch": 1.5675675675675675,
"grad_norm": 0.2955327518594707,
"learning_rate": 5e-06,
"loss": 0.2777,
"step": 174
},
{
"epoch": 1.5765765765765765,
"grad_norm": 0.27961760151449894,
"learning_rate": 5e-06,
"loss": 0.2838,
"step": 175
},
{
"epoch": 1.5855855855855856,
"grad_norm": 0.24665431850909808,
"learning_rate": 5e-06,
"loss": 0.2781,
"step": 176
},
{
"epoch": 1.5945945945945947,
"grad_norm": 0.26426261640553667,
"learning_rate": 5e-06,
"loss": 0.2816,
"step": 177
},
{
"epoch": 1.6036036036036037,
"grad_norm": 0.2711333903704824,
"learning_rate": 5e-06,
"loss": 0.3142,
"step": 178
},
{
"epoch": 1.6126126126126126,
"grad_norm": 0.2722379287245898,
"learning_rate": 5e-06,
"loss": 0.2816,
"step": 179
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.3012330875667607,
"learning_rate": 5e-06,
"loss": 0.3263,
"step": 180
},
{
"epoch": 1.6306306306306306,
"grad_norm": 0.2669108739090265,
"learning_rate": 5e-06,
"loss": 0.2952,
"step": 181
},
{
"epoch": 1.6396396396396398,
"grad_norm": 0.2748579289599078,
"learning_rate": 5e-06,
"loss": 0.2823,
"step": 182
},
{
"epoch": 1.6486486486486487,
"grad_norm": 0.29837425745633833,
"learning_rate": 5e-06,
"loss": 0.3038,
"step": 183
},
{
"epoch": 1.6576576576576576,
"grad_norm": 0.3305979404285009,
"learning_rate": 5e-06,
"loss": 0.3017,
"step": 184
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.26365462645864157,
"learning_rate": 5e-06,
"loss": 0.2952,
"step": 185
},
{
"epoch": 1.6756756756756757,
"grad_norm": 0.27117354048602127,
"learning_rate": 5e-06,
"loss": 0.2713,
"step": 186
},
{
"epoch": 1.6846846846846848,
"grad_norm": 0.2618109082938301,
"learning_rate": 5e-06,
"loss": 0.3259,
"step": 187
},
{
"epoch": 1.6936936936936937,
"grad_norm": 0.24890174240606217,
"learning_rate": 5e-06,
"loss": 0.2864,
"step": 188
},
{
"epoch": 1.7027027027027026,
"grad_norm": 0.28948368345439884,
"learning_rate": 5e-06,
"loss": 0.3242,
"step": 189
},
{
"epoch": 1.7117117117117115,
"grad_norm": 0.2659473815766033,
"learning_rate": 5e-06,
"loss": 0.2928,
"step": 190
},
{
"epoch": 1.7207207207207207,
"grad_norm": 0.26435921312812555,
"learning_rate": 5e-06,
"loss": 0.2747,
"step": 191
},
{
"epoch": 1.7297297297297298,
"grad_norm": 0.2834566804404197,
"learning_rate": 5e-06,
"loss": 0.3373,
"step": 192
},
{
"epoch": 1.7387387387387387,
"grad_norm": 0.26226690378932954,
"learning_rate": 5e-06,
"loss": 0.2713,
"step": 193
},
{
"epoch": 1.7477477477477477,
"grad_norm": 0.2574908549961044,
"learning_rate": 5e-06,
"loss": 0.2949,
"step": 194
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.2670216430713444,
"learning_rate": 5e-06,
"loss": 0.3105,
"step": 195
},
{
"epoch": 1.7657657657657657,
"grad_norm": 0.2644549565961117,
"learning_rate": 5e-06,
"loss": 0.2974,
"step": 196
},
{
"epoch": 1.7747747747747749,
"grad_norm": 0.2754975911578592,
"learning_rate": 5e-06,
"loss": 0.3019,
"step": 197
},
{
"epoch": 1.7837837837837838,
"grad_norm": 0.28446592391114817,
"learning_rate": 5e-06,
"loss": 0.3148,
"step": 198
},
{
"epoch": 1.7927927927927927,
"grad_norm": 0.28893386362511947,
"learning_rate": 5e-06,
"loss": 0.3189,
"step": 199
},
{
"epoch": 1.8018018018018018,
"grad_norm": 0.2869246676669029,
"learning_rate": 5e-06,
"loss": 0.3015,
"step": 200
},
{
"epoch": 1.810810810810811,
"grad_norm": 0.2847178633594474,
"learning_rate": 5e-06,
"loss": 0.2628,
"step": 201
},
{
"epoch": 1.8198198198198199,
"grad_norm": 0.2946725850660284,
"learning_rate": 5e-06,
"loss": 0.2768,
"step": 202
},
{
"epoch": 1.8288288288288288,
"grad_norm": 0.29608299277278,
"learning_rate": 5e-06,
"loss": 0.3171,
"step": 203
},
{
"epoch": 1.8378378378378377,
"grad_norm": 0.28628382246998885,
"learning_rate": 5e-06,
"loss": 0.3096,
"step": 204
},
{
"epoch": 1.8468468468468469,
"grad_norm": 0.2660371973699119,
"learning_rate": 5e-06,
"loss": 0.2685,
"step": 205
},
{
"epoch": 1.855855855855856,
"grad_norm": 0.2514264016055165,
"learning_rate": 5e-06,
"loss": 0.2622,
"step": 206
},
{
"epoch": 1.864864864864865,
"grad_norm": 0.2675623714158383,
"learning_rate": 5e-06,
"loss": 0.3324,
"step": 207
},
{
"epoch": 1.8738738738738738,
"grad_norm": 0.2817065371989752,
"learning_rate": 5e-06,
"loss": 0.2926,
"step": 208
},
{
"epoch": 1.8828828828828827,
"grad_norm": 0.24376840027264843,
"learning_rate": 5e-06,
"loss": 0.2695,
"step": 209
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.2679237524654036,
"learning_rate": 5e-06,
"loss": 0.2606,
"step": 210
},
{
"epoch": 1.900900900900901,
"grad_norm": 0.2593077892544588,
"learning_rate": 5e-06,
"loss": 0.2781,
"step": 211
},
{
"epoch": 1.90990990990991,
"grad_norm": 0.2555343741606999,
"learning_rate": 5e-06,
"loss": 0.2616,
"step": 212
},
{
"epoch": 1.9189189189189189,
"grad_norm": 0.27065363914180135,
"learning_rate": 5e-06,
"loss": 0.309,
"step": 213
},
{
"epoch": 1.9279279279279278,
"grad_norm": 0.29950662348843465,
"learning_rate": 5e-06,
"loss": 0.2953,
"step": 214
},
{
"epoch": 1.936936936936937,
"grad_norm": 0.30392398016557,
"learning_rate": 5e-06,
"loss": 0.3302,
"step": 215
},
{
"epoch": 1.945945945945946,
"grad_norm": 0.2688781676455933,
"learning_rate": 5e-06,
"loss": 0.2946,
"step": 216
},
{
"epoch": 1.954954954954955,
"grad_norm": 0.27334249580678227,
"learning_rate": 5e-06,
"loss": 0.3169,
"step": 217
},
{
"epoch": 1.9639639639639639,
"grad_norm": 0.2637661232011851,
"learning_rate": 5e-06,
"loss": 0.2923,
"step": 218
},
{
"epoch": 1.972972972972973,
"grad_norm": 0.24845919128888916,
"learning_rate": 5e-06,
"loss": 0.2956,
"step": 219
},
{
"epoch": 1.981981981981982,
"grad_norm": 0.2677476120892863,
"learning_rate": 5e-06,
"loss": 0.2725,
"step": 220
},
{
"epoch": 1.990990990990991,
"grad_norm": 0.27245457118100547,
"learning_rate": 5e-06,
"loss": 0.304,
"step": 221
},
{
"epoch": 2.0,
"grad_norm": 0.2632364290696338,
"learning_rate": 5e-06,
"loss": 0.2759,
"step": 222
},
{
"epoch": 2.009009009009009,
"grad_norm": 0.29524131111947416,
"learning_rate": 5e-06,
"loss": 0.2467,
"step": 223
},
{
"epoch": 2.018018018018018,
"grad_norm": 0.26959444826517864,
"learning_rate": 5e-06,
"loss": 0.2509,
"step": 224
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.24776989679141162,
"learning_rate": 5e-06,
"loss": 0.2647,
"step": 225
},
{
"epoch": 2.036036036036036,
"grad_norm": 0.24922491602278132,
"learning_rate": 5e-06,
"loss": 0.2734,
"step": 226
},
{
"epoch": 2.045045045045045,
"grad_norm": 0.2637011140567836,
"learning_rate": 5e-06,
"loss": 0.2538,
"step": 227
},
{
"epoch": 2.054054054054054,
"grad_norm": 0.24677968833597697,
"learning_rate": 5e-06,
"loss": 0.2569,
"step": 228
},
{
"epoch": 2.063063063063063,
"grad_norm": 0.25749179244984177,
"learning_rate": 5e-06,
"loss": 0.266,
"step": 229
},
{
"epoch": 2.0720720720720722,
"grad_norm": 0.2704364348984915,
"learning_rate": 5e-06,
"loss": 0.2645,
"step": 230
},
{
"epoch": 2.081081081081081,
"grad_norm": 0.2848341811917101,
"learning_rate": 5e-06,
"loss": 0.258,
"step": 231
},
{
"epoch": 2.09009009009009,
"grad_norm": 0.2539455237645273,
"learning_rate": 5e-06,
"loss": 0.2648,
"step": 232
},
{
"epoch": 2.099099099099099,
"grad_norm": 0.2534894136461773,
"learning_rate": 5e-06,
"loss": 0.2611,
"step": 233
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.2666435185167066,
"learning_rate": 5e-06,
"loss": 0.2671,
"step": 234
},
{
"epoch": 2.1171171171171173,
"grad_norm": 0.275032039682747,
"learning_rate": 5e-06,
"loss": 0.2807,
"step": 235
},
{
"epoch": 2.126126126126126,
"grad_norm": 0.24537895004936466,
"learning_rate": 5e-06,
"loss": 0.2777,
"step": 236
},
{
"epoch": 2.135135135135135,
"grad_norm": 0.29459998669694115,
"learning_rate": 5e-06,
"loss": 0.2782,
"step": 237
},
{
"epoch": 2.144144144144144,
"grad_norm": 0.2727554788191977,
"learning_rate": 5e-06,
"loss": 0.2687,
"step": 238
},
{
"epoch": 2.153153153153153,
"grad_norm": 0.30880501847599995,
"learning_rate": 5e-06,
"loss": 0.2878,
"step": 239
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.2886633976684916,
"learning_rate": 5e-06,
"loss": 0.267,
"step": 240
},
{
"epoch": 2.171171171171171,
"grad_norm": 0.2597628174067978,
"learning_rate": 5e-06,
"loss": 0.2648,
"step": 241
},
{
"epoch": 2.18018018018018,
"grad_norm": 0.2534324931692372,
"learning_rate": 5e-06,
"loss": 0.2981,
"step": 242
},
{
"epoch": 2.189189189189189,
"grad_norm": 0.2563993838591747,
"learning_rate": 5e-06,
"loss": 0.2487,
"step": 243
},
{
"epoch": 2.1981981981981984,
"grad_norm": 0.2852726219398302,
"learning_rate": 5e-06,
"loss": 0.2543,
"step": 244
},
{
"epoch": 2.2072072072072073,
"grad_norm": 0.30478195170068134,
"learning_rate": 5e-06,
"loss": 0.2562,
"step": 245
},
{
"epoch": 2.2162162162162162,
"grad_norm": 0.24772685929517294,
"learning_rate": 5e-06,
"loss": 0.2869,
"step": 246
},
{
"epoch": 2.225225225225225,
"grad_norm": 0.26428977786941277,
"learning_rate": 5e-06,
"loss": 0.2709,
"step": 247
},
{
"epoch": 2.234234234234234,
"grad_norm": 0.2447098843485426,
"learning_rate": 5e-06,
"loss": 0.2241,
"step": 248
},
{
"epoch": 2.2432432432432434,
"grad_norm": 0.2841804786817898,
"learning_rate": 5e-06,
"loss": 0.2398,
"step": 249
},
{
"epoch": 2.2522522522522523,
"grad_norm": 0.2837413945636495,
"learning_rate": 5e-06,
"loss": 0.2755,
"step": 250
},
{
"epoch": 2.2612612612612613,
"grad_norm": 0.27688677145182117,
"learning_rate": 5e-06,
"loss": 0.2581,
"step": 251
},
{
"epoch": 2.27027027027027,
"grad_norm": 0.2524013812037196,
"learning_rate": 5e-06,
"loss": 0.2447,
"step": 252
},
{
"epoch": 2.279279279279279,
"grad_norm": 0.25708866849265744,
"learning_rate": 5e-06,
"loss": 0.2629,
"step": 253
},
{
"epoch": 2.2882882882882885,
"grad_norm": 0.31089756790372536,
"learning_rate": 5e-06,
"loss": 0.262,
"step": 254
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.2580437334513352,
"learning_rate": 5e-06,
"loss": 0.2672,
"step": 255
},
{
"epoch": 2.3063063063063063,
"grad_norm": 0.25589033140205797,
"learning_rate": 5e-06,
"loss": 0.2854,
"step": 256
},
{
"epoch": 2.315315315315315,
"grad_norm": 0.2851188761111017,
"learning_rate": 5e-06,
"loss": 0.2847,
"step": 257
},
{
"epoch": 2.3243243243243246,
"grad_norm": 0.2742352435214708,
"learning_rate": 5e-06,
"loss": 0.2863,
"step": 258
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.25574343614682743,
"learning_rate": 5e-06,
"loss": 0.2744,
"step": 259
},
{
"epoch": 2.3423423423423424,
"grad_norm": 0.2704501372387818,
"learning_rate": 5e-06,
"loss": 0.2356,
"step": 260
},
{
"epoch": 2.3513513513513513,
"grad_norm": 0.2694883625074875,
"learning_rate": 5e-06,
"loss": 0.2845,
"step": 261
},
{
"epoch": 2.3603603603603602,
"grad_norm": 0.2749897171746042,
"learning_rate": 5e-06,
"loss": 0.2745,
"step": 262
},
{
"epoch": 2.3693693693693696,
"grad_norm": 0.33678826387641014,
"learning_rate": 5e-06,
"loss": 0.3088,
"step": 263
},
{
"epoch": 2.3783783783783785,
"grad_norm": 0.2773165283746789,
"learning_rate": 5e-06,
"loss": 0.2946,
"step": 264
},
{
"epoch": 2.3873873873873874,
"grad_norm": 0.31677913584086903,
"learning_rate": 5e-06,
"loss": 0.3179,
"step": 265
},
{
"epoch": 2.3963963963963963,
"grad_norm": 0.2563051452749462,
"learning_rate": 5e-06,
"loss": 0.2625,
"step": 266
},
{
"epoch": 2.4054054054054053,
"grad_norm": 0.321688693489085,
"learning_rate": 5e-06,
"loss": 0.2671,
"step": 267
},
{
"epoch": 2.4144144144144146,
"grad_norm": 0.26634437339972133,
"learning_rate": 5e-06,
"loss": 0.2613,
"step": 268
},
{
"epoch": 2.4234234234234235,
"grad_norm": 0.27171211584580457,
"learning_rate": 5e-06,
"loss": 0.2628,
"step": 269
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.2555430715005437,
"learning_rate": 5e-06,
"loss": 0.2687,
"step": 270
},
{
"epoch": 2.4414414414414414,
"grad_norm": 0.24255848197003171,
"learning_rate": 5e-06,
"loss": 0.2941,
"step": 271
},
{
"epoch": 2.4504504504504503,
"grad_norm": 0.29538238957980967,
"learning_rate": 5e-06,
"loss": 0.2777,
"step": 272
},
{
"epoch": 2.4594594594594597,
"grad_norm": 0.2876545631402078,
"learning_rate": 5e-06,
"loss": 0.2764,
"step": 273
},
{
"epoch": 2.4684684684684686,
"grad_norm": 0.2773762933353327,
"learning_rate": 5e-06,
"loss": 0.2834,
"step": 274
},
{
"epoch": 2.4774774774774775,
"grad_norm": 0.25275190194965114,
"learning_rate": 5e-06,
"loss": 0.2625,
"step": 275
},
{
"epoch": 2.4864864864864864,
"grad_norm": 0.30548139692249815,
"learning_rate": 5e-06,
"loss": 0.264,
"step": 276
},
{
"epoch": 2.4954954954954953,
"grad_norm": 0.2857116539220258,
"learning_rate": 5e-06,
"loss": 0.2663,
"step": 277
},
{
"epoch": 2.5045045045045047,
"grad_norm": 0.27127459034653845,
"learning_rate": 5e-06,
"loss": 0.289,
"step": 278
},
{
"epoch": 2.5135135135135136,
"grad_norm": 0.29403524162565264,
"learning_rate": 5e-06,
"loss": 0.2665,
"step": 279
},
{
"epoch": 2.5225225225225225,
"grad_norm": 0.2982604039719257,
"learning_rate": 5e-06,
"loss": 0.2635,
"step": 280
},
{
"epoch": 2.5315315315315314,
"grad_norm": 0.25776587175299304,
"learning_rate": 5e-06,
"loss": 0.2592,
"step": 281
},
{
"epoch": 2.5405405405405403,
"grad_norm": 0.2646598986862087,
"learning_rate": 5e-06,
"loss": 0.2579,
"step": 282
},
{
"epoch": 2.5495495495495497,
"grad_norm": 0.24717949544087905,
"learning_rate": 5e-06,
"loss": 0.2544,
"step": 283
},
{
"epoch": 2.5585585585585586,
"grad_norm": 0.2657887766041429,
"learning_rate": 5e-06,
"loss": 0.2656,
"step": 284
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.27748457946008864,
"learning_rate": 5e-06,
"loss": 0.2675,
"step": 285
},
{
"epoch": 2.5765765765765765,
"grad_norm": 0.25089374600320746,
"learning_rate": 5e-06,
"loss": 0.2816,
"step": 286
},
{
"epoch": 2.5855855855855854,
"grad_norm": 0.28897866413916584,
"learning_rate": 5e-06,
"loss": 0.2589,
"step": 287
},
{
"epoch": 2.5945945945945947,
"grad_norm": 0.26235423487495346,
"learning_rate": 5e-06,
"loss": 0.271,
"step": 288
},
{
"epoch": 2.6036036036036037,
"grad_norm": 0.29773828111895406,
"learning_rate": 5e-06,
"loss": 0.2884,
"step": 289
},
{
"epoch": 2.6126126126126126,
"grad_norm": 0.2732062490555635,
"learning_rate": 5e-06,
"loss": 0.297,
"step": 290
},
{
"epoch": 2.6216216216216215,
"grad_norm": 0.28269145506341653,
"learning_rate": 5e-06,
"loss": 0.2794,
"step": 291
},
{
"epoch": 2.6306306306306304,
"grad_norm": 0.2592351362804753,
"learning_rate": 5e-06,
"loss": 0.2653,
"step": 292
},
{
"epoch": 2.6396396396396398,
"grad_norm": 0.27363184791488976,
"learning_rate": 5e-06,
"loss": 0.2659,
"step": 293
},
{
"epoch": 2.6486486486486487,
"grad_norm": 0.2687283362268144,
"learning_rate": 5e-06,
"loss": 0.2881,
"step": 294
},
{
"epoch": 2.6576576576576576,
"grad_norm": 0.2669999794761192,
"learning_rate": 5e-06,
"loss": 0.2658,
"step": 295
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.2584313873251436,
"learning_rate": 5e-06,
"loss": 0.2597,
"step": 296
},
{
"epoch": 2.6756756756756754,
"grad_norm": 0.2870412914872632,
"learning_rate": 5e-06,
"loss": 0.273,
"step": 297
},
{
"epoch": 2.684684684684685,
"grad_norm": 0.2565405158611234,
"learning_rate": 5e-06,
"loss": 0.2471,
"step": 298
},
{
"epoch": 2.6936936936936937,
"grad_norm": 0.2718920473228364,
"learning_rate": 5e-06,
"loss": 0.2556,
"step": 299
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.2732398668856954,
"learning_rate": 5e-06,
"loss": 0.2729,
"step": 300
},
{
"epoch": 2.7117117117117115,
"grad_norm": 0.25213076888264274,
"learning_rate": 5e-06,
"loss": 0.2659,
"step": 301
},
{
"epoch": 2.7207207207207205,
"grad_norm": 0.25342262780535696,
"learning_rate": 5e-06,
"loss": 0.2791,
"step": 302
},
{
"epoch": 2.72972972972973,
"grad_norm": 0.2929513672092119,
"learning_rate": 5e-06,
"loss": 0.2714,
"step": 303
},
{
"epoch": 2.7387387387387387,
"grad_norm": 0.27482309634629043,
"learning_rate": 5e-06,
"loss": 0.2646,
"step": 304
},
{
"epoch": 2.7477477477477477,
"grad_norm": 0.26495695016553,
"learning_rate": 5e-06,
"loss": 0.2655,
"step": 305
},
{
"epoch": 2.756756756756757,
"grad_norm": 0.2751450071843517,
"learning_rate": 5e-06,
"loss": 0.282,
"step": 306
},
{
"epoch": 2.7657657657657655,
"grad_norm": 0.2492074837362159,
"learning_rate": 5e-06,
"loss": 0.2728,
"step": 307
},
{
"epoch": 2.774774774774775,
"grad_norm": 0.24588259514355568,
"learning_rate": 5e-06,
"loss": 0.2506,
"step": 308
},
{
"epoch": 2.7837837837837838,
"grad_norm": 0.290865691950273,
"learning_rate": 5e-06,
"loss": 0.3019,
"step": 309
},
{
"epoch": 2.7927927927927927,
"grad_norm": 0.24649105252907824,
"learning_rate": 5e-06,
"loss": 0.3028,
"step": 310
},
{
"epoch": 2.801801801801802,
"grad_norm": 0.24865219694730992,
"learning_rate": 5e-06,
"loss": 0.2647,
"step": 311
},
{
"epoch": 2.810810810810811,
"grad_norm": 0.2641273618850612,
"learning_rate": 5e-06,
"loss": 0.2743,
"step": 312
},
{
"epoch": 2.81981981981982,
"grad_norm": 0.27036448999028867,
"learning_rate": 5e-06,
"loss": 0.2483,
"step": 313
},
{
"epoch": 2.828828828828829,
"grad_norm": 0.277820288498933,
"learning_rate": 5e-06,
"loss": 0.2478,
"step": 314
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.25834412495274456,
"learning_rate": 5e-06,
"loss": 0.2395,
"step": 315
},
{
"epoch": 2.846846846846847,
"grad_norm": 0.25827263911198917,
"learning_rate": 5e-06,
"loss": 0.222,
"step": 316
},
{
"epoch": 2.855855855855856,
"grad_norm": 0.28475747286608616,
"learning_rate": 5e-06,
"loss": 0.2747,
"step": 317
},
{
"epoch": 2.864864864864865,
"grad_norm": 0.25037323222188695,
"learning_rate": 5e-06,
"loss": 0.2689,
"step": 318
},
{
"epoch": 2.873873873873874,
"grad_norm": 0.2652972773806203,
"learning_rate": 5e-06,
"loss": 0.2477,
"step": 319
},
{
"epoch": 2.8828828828828827,
"grad_norm": 0.26279014403702605,
"learning_rate": 5e-06,
"loss": 0.2734,
"step": 320
},
{
"epoch": 2.891891891891892,
"grad_norm": 0.2854042712916503,
"learning_rate": 5e-06,
"loss": 0.2679,
"step": 321
},
{
"epoch": 2.900900900900901,
"grad_norm": 0.26077805779165003,
"learning_rate": 5e-06,
"loss": 0.2608,
"step": 322
},
{
"epoch": 2.90990990990991,
"grad_norm": 0.255112093170312,
"learning_rate": 5e-06,
"loss": 0.2695,
"step": 323
},
{
"epoch": 2.918918918918919,
"grad_norm": 0.26211588620202336,
"learning_rate": 5e-06,
"loss": 0.2424,
"step": 324
},
{
"epoch": 2.9279279279279278,
"grad_norm": 0.2685084403266774,
"learning_rate": 5e-06,
"loss": 0.235,
"step": 325
},
{
"epoch": 2.936936936936937,
"grad_norm": 0.27269803144536753,
"learning_rate": 5e-06,
"loss": 0.2759,
"step": 326
},
{
"epoch": 2.945945945945946,
"grad_norm": 0.26751393672770624,
"learning_rate": 5e-06,
"loss": 0.2564,
"step": 327
},
{
"epoch": 2.954954954954955,
"grad_norm": 0.2665543902683488,
"learning_rate": 5e-06,
"loss": 0.2763,
"step": 328
},
{
"epoch": 2.963963963963964,
"grad_norm": 0.28496550173938856,
"learning_rate": 5e-06,
"loss": 0.2762,
"step": 329
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.2567341688659859,
"learning_rate": 5e-06,
"loss": 0.2756,
"step": 330
},
{
"epoch": 2.981981981981982,
"grad_norm": 0.2584671428651672,
"learning_rate": 5e-06,
"loss": 0.3001,
"step": 331
},
{
"epoch": 2.990990990990991,
"grad_norm": 0.2804525292556161,
"learning_rate": 5e-06,
"loss": 0.2785,
"step": 332
},
{
"epoch": 3.0,
"grad_norm": 0.24187503112431247,
"learning_rate": 5e-06,
"loss": 0.2454,
"step": 333
},
{
"epoch": 3.009009009009009,
"grad_norm": 0.26503328616806615,
"learning_rate": 5e-06,
"loss": 0.2332,
"step": 334
},
{
"epoch": 3.018018018018018,
"grad_norm": 0.2631846355641096,
"learning_rate": 5e-06,
"loss": 0.2658,
"step": 335
},
{
"epoch": 3.027027027027027,
"grad_norm": 0.2786137851508687,
"learning_rate": 5e-06,
"loss": 0.2519,
"step": 336
},
{
"epoch": 3.036036036036036,
"grad_norm": 0.2755722489630619,
"learning_rate": 5e-06,
"loss": 0.206,
"step": 337
},
{
"epoch": 3.045045045045045,
"grad_norm": 0.2530795628224832,
"learning_rate": 5e-06,
"loss": 0.2029,
"step": 338
},
{
"epoch": 3.054054054054054,
"grad_norm": 0.25959049991529565,
"learning_rate": 5e-06,
"loss": 0.2371,
"step": 339
},
{
"epoch": 3.063063063063063,
"grad_norm": 0.2916294807412774,
"learning_rate": 5e-06,
"loss": 0.2556,
"step": 340
},
{
"epoch": 3.0720720720720722,
"grad_norm": 0.2790615318391773,
"learning_rate": 5e-06,
"loss": 0.2198,
"step": 341
},
{
"epoch": 3.081081081081081,
"grad_norm": 0.3203392671142568,
"learning_rate": 5e-06,
"loss": 0.2693,
"step": 342
},
{
"epoch": 3.09009009009009,
"grad_norm": 0.2576637679316666,
"learning_rate": 5e-06,
"loss": 0.2304,
"step": 343
},
{
"epoch": 3.099099099099099,
"grad_norm": 0.24928248944605377,
"learning_rate": 5e-06,
"loss": 0.2291,
"step": 344
},
{
"epoch": 3.108108108108108,
"grad_norm": 0.26793696602953165,
"learning_rate": 5e-06,
"loss": 0.2445,
"step": 345
},
{
"epoch": 3.1171171171171173,
"grad_norm": 0.2971915014155351,
"learning_rate": 5e-06,
"loss": 0.2416,
"step": 346
},
{
"epoch": 3.126126126126126,
"grad_norm": 0.27752473751722373,
"learning_rate": 5e-06,
"loss": 0.219,
"step": 347
},
{
"epoch": 3.135135135135135,
"grad_norm": 0.2794534994824032,
"learning_rate": 5e-06,
"loss": 0.2297,
"step": 348
},
{
"epoch": 3.144144144144144,
"grad_norm": 0.25926948541703204,
"learning_rate": 5e-06,
"loss": 0.2411,
"step": 349
},
{
"epoch": 3.153153153153153,
"grad_norm": 0.24722918054941584,
"learning_rate": 5e-06,
"loss": 0.2419,
"step": 350
},
{
"epoch": 3.1621621621621623,
"grad_norm": 0.26203389917376085,
"learning_rate": 5e-06,
"loss": 0.234,
"step": 351
},
{
"epoch": 3.171171171171171,
"grad_norm": 0.2472074514309984,
"learning_rate": 5e-06,
"loss": 0.2363,
"step": 352
},
{
"epoch": 3.18018018018018,
"grad_norm": 0.2945063702553609,
"learning_rate": 5e-06,
"loss": 0.2435,
"step": 353
},
{
"epoch": 3.189189189189189,
"grad_norm": 0.27925373635853185,
"learning_rate": 5e-06,
"loss": 0.2746,
"step": 354
},
{
"epoch": 3.1981981981981984,
"grad_norm": 0.24996405356591392,
"learning_rate": 5e-06,
"loss": 0.2357,
"step": 355
},
{
"epoch": 3.2072072072072073,
"grad_norm": 0.2556106250304069,
"learning_rate": 5e-06,
"loss": 0.2499,
"step": 356
},
{
"epoch": 3.2162162162162162,
"grad_norm": 0.26114830248277804,
"learning_rate": 5e-06,
"loss": 0.2435,
"step": 357
},
{
"epoch": 3.225225225225225,
"grad_norm": 0.2469470177899144,
"learning_rate": 5e-06,
"loss": 0.2391,
"step": 358
},
{
"epoch": 3.234234234234234,
"grad_norm": 0.2641345310685226,
"learning_rate": 5e-06,
"loss": 0.1981,
"step": 359
},
{
"epoch": 3.2432432432432434,
"grad_norm": 0.2630942786949833,
"learning_rate": 5e-06,
"loss": 0.2098,
"step": 360
},
{
"epoch": 3.2522522522522523,
"grad_norm": 0.24708329710543495,
"learning_rate": 5e-06,
"loss": 0.2228,
"step": 361
},
{
"epoch": 3.2612612612612613,
"grad_norm": 0.25000693689900794,
"learning_rate": 5e-06,
"loss": 0.242,
"step": 362
},
{
"epoch": 3.27027027027027,
"grad_norm": 0.2554644897448756,
"learning_rate": 5e-06,
"loss": 0.2558,
"step": 363
},
{
"epoch": 3.279279279279279,
"grad_norm": 0.25264038317978293,
"learning_rate": 5e-06,
"loss": 0.2009,
"step": 364
},
{
"epoch": 3.2882882882882885,
"grad_norm": 0.2743512388274681,
"learning_rate": 5e-06,
"loss": 0.2331,
"step": 365
},
{
"epoch": 3.2972972972972974,
"grad_norm": 0.2728238972210015,
"learning_rate": 5e-06,
"loss": 0.238,
"step": 366
},
{
"epoch": 3.3063063063063063,
"grad_norm": 0.2602352997656632,
"learning_rate": 5e-06,
"loss": 0.2595,
"step": 367
},
{
"epoch": 3.315315315315315,
"grad_norm": 0.27036311534944873,
"learning_rate": 5e-06,
"loss": 0.2093,
"step": 368
},
{
"epoch": 3.3243243243243246,
"grad_norm": 0.264625202176752,
"learning_rate": 5e-06,
"loss": 0.2579,
"step": 369
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.259895631515348,
"learning_rate": 5e-06,
"loss": 0.2375,
"step": 370
},
{
"epoch": 3.3423423423423424,
"grad_norm": 0.2563353260712296,
"learning_rate": 5e-06,
"loss": 0.2364,
"step": 371
},
{
"epoch": 3.3513513513513513,
"grad_norm": 0.28822107627305354,
"learning_rate": 5e-06,
"loss": 0.2493,
"step": 372
},
{
"epoch": 3.3603603603603602,
"grad_norm": 0.25680447088580227,
"learning_rate": 5e-06,
"loss": 0.2501,
"step": 373
},
{
"epoch": 3.3693693693693696,
"grad_norm": 0.27784185650966475,
"learning_rate": 5e-06,
"loss": 0.245,
"step": 374
},
{
"epoch": 3.3783783783783785,
"grad_norm": 0.2627541742958857,
"learning_rate": 5e-06,
"loss": 0.2387,
"step": 375
},
{
"epoch": 3.3873873873873874,
"grad_norm": 0.24193274859474298,
"learning_rate": 5e-06,
"loss": 0.2375,
"step": 376
},
{
"epoch": 3.3963963963963963,
"grad_norm": 0.258378796876473,
"learning_rate": 5e-06,
"loss": 0.2281,
"step": 377
},
{
"epoch": 3.4054054054054053,
"grad_norm": 0.2749899330352957,
"learning_rate": 5e-06,
"loss": 0.24,
"step": 378
},
{
"epoch": 3.4144144144144146,
"grad_norm": 0.25777164751813997,
"learning_rate": 5e-06,
"loss": 0.2524,
"step": 379
},
{
"epoch": 3.4234234234234235,
"grad_norm": 0.2805168544005753,
"learning_rate": 5e-06,
"loss": 0.2415,
"step": 380
},
{
"epoch": 3.4324324324324325,
"grad_norm": 0.25842839628916536,
"learning_rate": 5e-06,
"loss": 0.2433,
"step": 381
},
{
"epoch": 3.4414414414414414,
"grad_norm": 0.26639980982056893,
"learning_rate": 5e-06,
"loss": 0.2403,
"step": 382
},
{
"epoch": 3.4504504504504503,
"grad_norm": 0.3060982219805088,
"learning_rate": 5e-06,
"loss": 0.2236,
"step": 383
},
{
"epoch": 3.4594594594594597,
"grad_norm": 0.26146902459280136,
"learning_rate": 5e-06,
"loss": 0.2504,
"step": 384
},
{
"epoch": 3.4684684684684686,
"grad_norm": 0.25380491317438975,
"learning_rate": 5e-06,
"loss": 0.2464,
"step": 385
},
{
"epoch": 3.4774774774774775,
"grad_norm": 0.27324232509875496,
"learning_rate": 5e-06,
"loss": 0.2404,
"step": 386
},
{
"epoch": 3.4864864864864864,
"grad_norm": 0.2651723560610241,
"learning_rate": 5e-06,
"loss": 0.226,
"step": 387
},
{
"epoch": 3.4954954954954953,
"grad_norm": 0.2689389917124243,
"learning_rate": 5e-06,
"loss": 0.2751,
"step": 388
},
{
"epoch": 3.5045045045045047,
"grad_norm": 0.2643418768447757,
"learning_rate": 5e-06,
"loss": 0.2298,
"step": 389
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.24935046689303417,
"learning_rate": 5e-06,
"loss": 0.2505,
"step": 390
},
{
"epoch": 3.5225225225225225,
"grad_norm": 0.2508765789856499,
"learning_rate": 5e-06,
"loss": 0.2478,
"step": 391
},
{
"epoch": 3.5315315315315314,
"grad_norm": 0.26705709850776205,
"learning_rate": 5e-06,
"loss": 0.2225,
"step": 392
},
{
"epoch": 3.5405405405405403,
"grad_norm": 0.2573422869010653,
"learning_rate": 5e-06,
"loss": 0.2122,
"step": 393
},
{
"epoch": 3.5495495495495497,
"grad_norm": 0.2770154762726231,
"learning_rate": 5e-06,
"loss": 0.2802,
"step": 394
},
{
"epoch": 3.5585585585585586,
"grad_norm": 0.26710684568846427,
"learning_rate": 5e-06,
"loss": 0.2275,
"step": 395
},
{
"epoch": 3.5675675675675675,
"grad_norm": 0.2527476600992376,
"learning_rate": 5e-06,
"loss": 0.2445,
"step": 396
},
{
"epoch": 3.5765765765765765,
"grad_norm": 0.2521141774058005,
"learning_rate": 5e-06,
"loss": 0.2342,
"step": 397
},
{
"epoch": 3.5855855855855854,
"grad_norm": 0.2689995200221707,
"learning_rate": 5e-06,
"loss": 0.2518,
"step": 398
},
{
"epoch": 3.5945945945945947,
"grad_norm": 0.25908754443823273,
"learning_rate": 5e-06,
"loss": 0.2386,
"step": 399
},
{
"epoch": 3.6036036036036037,
"grad_norm": 0.273518168337783,
"learning_rate": 5e-06,
"loss": 0.2641,
"step": 400
},
{
"epoch": 3.6126126126126126,
"grad_norm": 0.26669639385445737,
"learning_rate": 5e-06,
"loss": 0.2359,
"step": 401
},
{
"epoch": 3.6216216216216215,
"grad_norm": 0.2560702170541,
"learning_rate": 5e-06,
"loss": 0.2337,
"step": 402
},
{
"epoch": 3.6306306306306304,
"grad_norm": 0.2461177958525498,
"learning_rate": 5e-06,
"loss": 0.2401,
"step": 403
},
{
"epoch": 3.6396396396396398,
"grad_norm": 0.2648097200804019,
"learning_rate": 5e-06,
"loss": 0.2144,
"step": 404
},
{
"epoch": 3.6486486486486487,
"grad_norm": 0.2646834290329095,
"learning_rate": 5e-06,
"loss": 0.2493,
"step": 405
},
{
"epoch": 3.6576576576576576,
"grad_norm": 0.2796973639180676,
"learning_rate": 5e-06,
"loss": 0.2467,
"step": 406
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.25308085485220105,
"learning_rate": 5e-06,
"loss": 0.227,
"step": 407
},
{
"epoch": 3.6756756756756754,
"grad_norm": 0.2587012503429008,
"learning_rate": 5e-06,
"loss": 0.2285,
"step": 408
},
{
"epoch": 3.684684684684685,
"grad_norm": 0.2958300777778266,
"learning_rate": 5e-06,
"loss": 0.2547,
"step": 409
},
{
"epoch": 3.6936936936936937,
"grad_norm": 0.25334395158267925,
"learning_rate": 5e-06,
"loss": 0.2693,
"step": 410
},
{
"epoch": 3.7027027027027026,
"grad_norm": 0.29019457155713096,
"learning_rate": 5e-06,
"loss": 0.2518,
"step": 411
},
{
"epoch": 3.7117117117117115,
"grad_norm": 0.2473020184393372,
"learning_rate": 5e-06,
"loss": 0.2344,
"step": 412
},
{
"epoch": 3.7207207207207205,
"grad_norm": 0.270453761649425,
"learning_rate": 5e-06,
"loss": 0.2275,
"step": 413
},
{
"epoch": 3.72972972972973,
"grad_norm": 0.2602131546551776,
"learning_rate": 5e-06,
"loss": 0.2428,
"step": 414
},
{
"epoch": 3.7387387387387387,
"grad_norm": 0.29110180180417683,
"learning_rate": 5e-06,
"loss": 0.2301,
"step": 415
},
{
"epoch": 3.7477477477477477,
"grad_norm": 0.25367703106621997,
"learning_rate": 5e-06,
"loss": 0.2377,
"step": 416
},
{
"epoch": 3.756756756756757,
"grad_norm": 0.257299738969486,
"learning_rate": 5e-06,
"loss": 0.2137,
"step": 417
},
{
"epoch": 3.7657657657657655,
"grad_norm": 0.257656312443973,
"learning_rate": 5e-06,
"loss": 0.2567,
"step": 418
},
{
"epoch": 3.774774774774775,
"grad_norm": 0.2808325095308855,
"learning_rate": 5e-06,
"loss": 0.2571,
"step": 419
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.2657618644204265,
"learning_rate": 5e-06,
"loss": 0.2382,
"step": 420
},
{
"epoch": 3.7927927927927927,
"grad_norm": 0.27556658674977147,
"learning_rate": 5e-06,
"loss": 0.2748,
"step": 421
},
{
"epoch": 3.801801801801802,
"grad_norm": 0.2783118243091199,
"learning_rate": 5e-06,
"loss": 0.2349,
"step": 422
},
{
"epoch": 3.810810810810811,
"grad_norm": 0.27683880390148435,
"learning_rate": 5e-06,
"loss": 0.2545,
"step": 423
},
{
"epoch": 3.81981981981982,
"grad_norm": 0.24903071725050696,
"learning_rate": 5e-06,
"loss": 0.2436,
"step": 424
},
{
"epoch": 3.828828828828829,
"grad_norm": 0.27140890180707533,
"learning_rate": 5e-06,
"loss": 0.2103,
"step": 425
},
{
"epoch": 3.8378378378378377,
"grad_norm": 0.25999693913444694,
"learning_rate": 5e-06,
"loss": 0.2703,
"step": 426
},
{
"epoch": 3.846846846846847,
"grad_norm": 0.28165585165926776,
"learning_rate": 5e-06,
"loss": 0.2395,
"step": 427
},
{
"epoch": 3.855855855855856,
"grad_norm": 0.26800670806664434,
"learning_rate": 5e-06,
"loss": 0.2855,
"step": 428
},
{
"epoch": 3.864864864864865,
"grad_norm": 0.26752171553410126,
"learning_rate": 5e-06,
"loss": 0.2525,
"step": 429
},
{
"epoch": 3.873873873873874,
"grad_norm": 0.2550812423474624,
"learning_rate": 5e-06,
"loss": 0.2357,
"step": 430
},
{
"epoch": 3.8828828828828827,
"grad_norm": 0.25341757674985854,
"learning_rate": 5e-06,
"loss": 0.2752,
"step": 431
},
{
"epoch": 3.891891891891892,
"grad_norm": 0.2714456590973952,
"learning_rate": 5e-06,
"loss": 0.2399,
"step": 432
},
{
"epoch": 3.900900900900901,
"grad_norm": 0.2832850264958553,
"learning_rate": 5e-06,
"loss": 0.232,
"step": 433
},
{
"epoch": 3.90990990990991,
"grad_norm": 0.2560994537050628,
"learning_rate": 5e-06,
"loss": 0.2457,
"step": 434
},
{
"epoch": 3.918918918918919,
"grad_norm": 0.2624403245035626,
"learning_rate": 5e-06,
"loss": 0.2782,
"step": 435
},
{
"epoch": 3.9279279279279278,
"grad_norm": 0.2645012258501843,
"learning_rate": 5e-06,
"loss": 0.2432,
"step": 436
},
{
"epoch": 3.936936936936937,
"grad_norm": 0.26607477226554654,
"learning_rate": 5e-06,
"loss": 0.2202,
"step": 437
},
{
"epoch": 3.945945945945946,
"grad_norm": 0.2731452758231204,
"learning_rate": 5e-06,
"loss": 0.2689,
"step": 438
},
{
"epoch": 3.954954954954955,
"grad_norm": 0.2964590337329977,
"learning_rate": 5e-06,
"loss": 0.229,
"step": 439
},
{
"epoch": 3.963963963963964,
"grad_norm": 0.2787999534447745,
"learning_rate": 5e-06,
"loss": 0.2527,
"step": 440
},
{
"epoch": 3.972972972972973,
"grad_norm": 0.24055312465968123,
"learning_rate": 5e-06,
"loss": 0.2231,
"step": 441
},
{
"epoch": 3.981981981981982,
"grad_norm": 0.2757745274177008,
"learning_rate": 5e-06,
"loss": 0.2437,
"step": 442
},
{
"epoch": 3.990990990990991,
"grad_norm": 0.26536706718909975,
"learning_rate": 5e-06,
"loss": 0.2335,
"step": 443
},
{
"epoch": 4.0,
"grad_norm": 0.2390963333912312,
"learning_rate": 5e-06,
"loss": 0.2237,
"step": 444
},
{
"epoch": 4.009009009009009,
"grad_norm": 0.2720207934109716,
"learning_rate": 5e-06,
"loss": 0.2316,
"step": 445
},
{
"epoch": 4.018018018018018,
"grad_norm": 0.2673459557274162,
"learning_rate": 5e-06,
"loss": 0.2205,
"step": 446
},
{
"epoch": 4.027027027027027,
"grad_norm": 0.24447403903164172,
"learning_rate": 5e-06,
"loss": 0.2033,
"step": 447
},
{
"epoch": 4.036036036036036,
"grad_norm": 0.29354577394627634,
"learning_rate": 5e-06,
"loss": 0.2059,
"step": 448
},
{
"epoch": 4.045045045045045,
"grad_norm": 0.28252004790921936,
"learning_rate": 5e-06,
"loss": 0.2222,
"step": 449
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.279624558559084,
"learning_rate": 5e-06,
"loss": 0.2485,
"step": 450
},
{
"epoch": 4.063063063063063,
"grad_norm": 0.2742544682456035,
"learning_rate": 5e-06,
"loss": 0.2153,
"step": 451
},
{
"epoch": 4.072072072072072,
"grad_norm": 0.26315979594288036,
"learning_rate": 5e-06,
"loss": 0.1967,
"step": 452
},
{
"epoch": 4.081081081081081,
"grad_norm": 0.25548950244986113,
"learning_rate": 5e-06,
"loss": 0.2108,
"step": 453
},
{
"epoch": 4.09009009009009,
"grad_norm": 0.260763131351132,
"learning_rate": 5e-06,
"loss": 0.21,
"step": 454
},
{
"epoch": 4.099099099099099,
"grad_norm": 0.2705243300559351,
"learning_rate": 5e-06,
"loss": 0.1926,
"step": 455
},
{
"epoch": 4.108108108108108,
"grad_norm": 0.2572296275587587,
"learning_rate": 5e-06,
"loss": 0.2182,
"step": 456
},
{
"epoch": 4.117117117117117,
"grad_norm": 0.32370825872086306,
"learning_rate": 5e-06,
"loss": 0.1912,
"step": 457
},
{
"epoch": 4.126126126126126,
"grad_norm": 0.24556795850306926,
"learning_rate": 5e-06,
"loss": 0.2071,
"step": 458
},
{
"epoch": 4.135135135135135,
"grad_norm": 0.23389148126428516,
"learning_rate": 5e-06,
"loss": 0.234,
"step": 459
},
{
"epoch": 4.1441441441441444,
"grad_norm": 0.2428236778448457,
"learning_rate": 5e-06,
"loss": 0.2119,
"step": 460
},
{
"epoch": 4.153153153153153,
"grad_norm": 0.31106881930176683,
"learning_rate": 5e-06,
"loss": 0.2124,
"step": 461
},
{
"epoch": 4.162162162162162,
"grad_norm": 0.27122185214756195,
"learning_rate": 5e-06,
"loss": 0.2226,
"step": 462
},
{
"epoch": 4.171171171171171,
"grad_norm": 0.2996732981773459,
"learning_rate": 5e-06,
"loss": 0.2299,
"step": 463
},
{
"epoch": 4.18018018018018,
"grad_norm": 0.27023462008753,
"learning_rate": 5e-06,
"loss": 0.2557,
"step": 464
},
{
"epoch": 4.1891891891891895,
"grad_norm": 0.25842796305339033,
"learning_rate": 5e-06,
"loss": 0.2325,
"step": 465
},
{
"epoch": 4.198198198198198,
"grad_norm": 0.2437169161762717,
"learning_rate": 5e-06,
"loss": 0.1797,
"step": 466
},
{
"epoch": 4.207207207207207,
"grad_norm": 0.26780073229070595,
"learning_rate": 5e-06,
"loss": 0.2067,
"step": 467
},
{
"epoch": 4.216216216216216,
"grad_norm": 0.2670888124294135,
"learning_rate": 5e-06,
"loss": 0.2205,
"step": 468
},
{
"epoch": 4.225225225225225,
"grad_norm": 0.25879921020859936,
"learning_rate": 5e-06,
"loss": 0.2085,
"step": 469
},
{
"epoch": 4.2342342342342345,
"grad_norm": 0.26317981293875226,
"learning_rate": 5e-06,
"loss": 0.2336,
"step": 470
},
{
"epoch": 4.243243243243243,
"grad_norm": 0.23931715866089387,
"learning_rate": 5e-06,
"loss": 0.2124,
"step": 471
},
{
"epoch": 4.252252252252252,
"grad_norm": 0.2691126922298142,
"learning_rate": 5e-06,
"loss": 0.2065,
"step": 472
},
{
"epoch": 4.261261261261261,
"grad_norm": 0.23991879914940956,
"learning_rate": 5e-06,
"loss": 0.2045,
"step": 473
},
{
"epoch": 4.27027027027027,
"grad_norm": 0.2548563923839949,
"learning_rate": 5e-06,
"loss": 0.1836,
"step": 474
},
{
"epoch": 4.2792792792792795,
"grad_norm": 0.24697361737276458,
"learning_rate": 5e-06,
"loss": 0.1792,
"step": 475
},
{
"epoch": 4.288288288288288,
"grad_norm": 0.2829022630675641,
"learning_rate": 5e-06,
"loss": 0.2098,
"step": 476
},
{
"epoch": 4.297297297297297,
"grad_norm": 0.2620700761228271,
"learning_rate": 5e-06,
"loss": 0.2102,
"step": 477
},
{
"epoch": 4.306306306306306,
"grad_norm": 0.2628063026021744,
"learning_rate": 5e-06,
"loss": 0.2025,
"step": 478
},
{
"epoch": 4.315315315315315,
"grad_norm": 0.2863724297024661,
"learning_rate": 5e-06,
"loss": 0.237,
"step": 479
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.25990971129318524,
"learning_rate": 5e-06,
"loss": 0.2116,
"step": 480
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.2606038664504591,
"learning_rate": 5e-06,
"loss": 0.2092,
"step": 481
},
{
"epoch": 4.342342342342342,
"grad_norm": 0.253863280864317,
"learning_rate": 5e-06,
"loss": 0.164,
"step": 482
},
{
"epoch": 4.351351351351352,
"grad_norm": 0.24650022322727727,
"learning_rate": 5e-06,
"loss": 0.1852,
"step": 483
},
{
"epoch": 4.36036036036036,
"grad_norm": 0.25369962373757826,
"learning_rate": 5e-06,
"loss": 0.2183,
"step": 484
},
{
"epoch": 4.36936936936937,
"grad_norm": 0.28375278856958064,
"learning_rate": 5e-06,
"loss": 0.2052,
"step": 485
},
{
"epoch": 4.378378378378378,
"grad_norm": 0.24267201305207473,
"learning_rate": 5e-06,
"loss": 0.2367,
"step": 486
},
{
"epoch": 4.387387387387387,
"grad_norm": 0.25205690964559024,
"learning_rate": 5e-06,
"loss": 0.2212,
"step": 487
},
{
"epoch": 4.396396396396397,
"grad_norm": 0.25716800876310375,
"learning_rate": 5e-06,
"loss": 0.1906,
"step": 488
},
{
"epoch": 4.405405405405405,
"grad_norm": 0.23704968081876604,
"learning_rate": 5e-06,
"loss": 0.2075,
"step": 489
},
{
"epoch": 4.414414414414415,
"grad_norm": 0.3201956523912786,
"learning_rate": 5e-06,
"loss": 0.198,
"step": 490
},
{
"epoch": 4.423423423423423,
"grad_norm": 0.26301398337918436,
"learning_rate": 5e-06,
"loss": 0.1878,
"step": 491
},
{
"epoch": 4.4324324324324325,
"grad_norm": 0.27402028797210554,
"learning_rate": 5e-06,
"loss": 0.2104,
"step": 492
},
{
"epoch": 4.441441441441442,
"grad_norm": 0.33955450203665727,
"learning_rate": 5e-06,
"loss": 0.2205,
"step": 493
},
{
"epoch": 4.45045045045045,
"grad_norm": 0.26220441610422024,
"learning_rate": 5e-06,
"loss": 0.218,
"step": 494
},
{
"epoch": 4.45945945945946,
"grad_norm": 0.281656218031479,
"learning_rate": 5e-06,
"loss": 0.2235,
"step": 495
},
{
"epoch": 4.468468468468468,
"grad_norm": 0.29159551817654267,
"learning_rate": 5e-06,
"loss": 0.2178,
"step": 496
},
{
"epoch": 4.4774774774774775,
"grad_norm": 0.2623117148967965,
"learning_rate": 5e-06,
"loss": 0.2178,
"step": 497
},
{
"epoch": 4.486486486486487,
"grad_norm": 0.2531840506893455,
"learning_rate": 5e-06,
"loss": 0.2086,
"step": 498
},
{
"epoch": 4.495495495495495,
"grad_norm": 0.25528977769788064,
"learning_rate": 5e-06,
"loss": 0.2186,
"step": 499
},
{
"epoch": 4.504504504504505,
"grad_norm": 0.2679628655435481,
"learning_rate": 5e-06,
"loss": 0.2237,
"step": 500
},
{
"epoch": 4.513513513513513,
"grad_norm": 0.263719988749634,
"learning_rate": 5e-06,
"loss": 0.194,
"step": 501
},
{
"epoch": 4.5225225225225225,
"grad_norm": 0.273138889734998,
"learning_rate": 5e-06,
"loss": 0.2086,
"step": 502
},
{
"epoch": 4.531531531531532,
"grad_norm": 0.292878429342998,
"learning_rate": 5e-06,
"loss": 0.2154,
"step": 503
},
{
"epoch": 4.54054054054054,
"grad_norm": 0.27619815070018144,
"learning_rate": 5e-06,
"loss": 0.2049,
"step": 504
},
{
"epoch": 4.54954954954955,
"grad_norm": 0.27527630799114594,
"learning_rate": 5e-06,
"loss": 0.2213,
"step": 505
},
{
"epoch": 4.558558558558558,
"grad_norm": 0.26879133234631997,
"learning_rate": 5e-06,
"loss": 0.2238,
"step": 506
},
{
"epoch": 4.5675675675675675,
"grad_norm": 0.272548643979066,
"learning_rate": 5e-06,
"loss": 0.2114,
"step": 507
},
{
"epoch": 4.576576576576577,
"grad_norm": 0.27819059711468064,
"learning_rate": 5e-06,
"loss": 0.2157,
"step": 508
},
{
"epoch": 4.585585585585585,
"grad_norm": 0.27618387944584083,
"learning_rate": 5e-06,
"loss": 0.2048,
"step": 509
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.2549425189875316,
"learning_rate": 5e-06,
"loss": 0.2174,
"step": 510
},
{
"epoch": 4.603603603603604,
"grad_norm": 0.2645903835474375,
"learning_rate": 5e-06,
"loss": 0.2216,
"step": 511
},
{
"epoch": 4.612612612612613,
"grad_norm": 0.2640684028376376,
"learning_rate": 5e-06,
"loss": 0.2182,
"step": 512
},
{
"epoch": 4.621621621621622,
"grad_norm": 0.26051198776980117,
"learning_rate": 5e-06,
"loss": 0.1808,
"step": 513
},
{
"epoch": 4.63063063063063,
"grad_norm": 0.2931023356142575,
"learning_rate": 5e-06,
"loss": 0.1941,
"step": 514
},
{
"epoch": 4.63963963963964,
"grad_norm": 0.25284181276914397,
"learning_rate": 5e-06,
"loss": 0.2362,
"step": 515
},
{
"epoch": 4.648648648648649,
"grad_norm": 0.2590084071736973,
"learning_rate": 5e-06,
"loss": 0.2417,
"step": 516
},
{
"epoch": 4.657657657657658,
"grad_norm": 0.30404451969520124,
"learning_rate": 5e-06,
"loss": 0.2186,
"step": 517
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.2673580882682002,
"learning_rate": 5e-06,
"loss": 0.2224,
"step": 518
},
{
"epoch": 4.675675675675675,
"grad_norm": 0.2636588657441614,
"learning_rate": 5e-06,
"loss": 0.2324,
"step": 519
},
{
"epoch": 4.684684684684685,
"grad_norm": 0.2876900527962799,
"learning_rate": 5e-06,
"loss": 0.2246,
"step": 520
},
{
"epoch": 4.693693693693694,
"grad_norm": 0.33566773437219993,
"learning_rate": 5e-06,
"loss": 0.2148,
"step": 521
},
{
"epoch": 4.702702702702703,
"grad_norm": 0.25837694824532986,
"learning_rate": 5e-06,
"loss": 0.2435,
"step": 522
},
{
"epoch": 4.711711711711712,
"grad_norm": 0.2618996341262811,
"learning_rate": 5e-06,
"loss": 0.2504,
"step": 523
},
{
"epoch": 4.7207207207207205,
"grad_norm": 0.2916721768764094,
"learning_rate": 5e-06,
"loss": 0.1969,
"step": 524
},
{
"epoch": 4.72972972972973,
"grad_norm": 0.2695124629228616,
"learning_rate": 5e-06,
"loss": 0.2242,
"step": 525
},
{
"epoch": 4.738738738738739,
"grad_norm": 0.25003767055634085,
"learning_rate": 5e-06,
"loss": 0.2396,
"step": 526
},
{
"epoch": 4.747747747747748,
"grad_norm": 0.26273587385726827,
"learning_rate": 5e-06,
"loss": 0.2389,
"step": 527
},
{
"epoch": 4.756756756756757,
"grad_norm": 0.2633999928270432,
"learning_rate": 5e-06,
"loss": 0.1999,
"step": 528
},
{
"epoch": 4.7657657657657655,
"grad_norm": 0.2657486138733691,
"learning_rate": 5e-06,
"loss": 0.1973,
"step": 529
},
{
"epoch": 4.774774774774775,
"grad_norm": 0.2615424263109113,
"learning_rate": 5e-06,
"loss": 0.2172,
"step": 530
},
{
"epoch": 4.783783783783784,
"grad_norm": 0.2725460425087256,
"learning_rate": 5e-06,
"loss": 0.2316,
"step": 531
},
{
"epoch": 4.792792792792793,
"grad_norm": 0.29663406158664646,
"learning_rate": 5e-06,
"loss": 0.215,
"step": 532
},
{
"epoch": 4.801801801801802,
"grad_norm": 0.2680114226198382,
"learning_rate": 5e-06,
"loss": 0.1913,
"step": 533
},
{
"epoch": 4.8108108108108105,
"grad_norm": 0.2717779322025023,
"learning_rate": 5e-06,
"loss": 0.2134,
"step": 534
},
{
"epoch": 4.81981981981982,
"grad_norm": 0.2461871817136421,
"learning_rate": 5e-06,
"loss": 0.2242,
"step": 535
},
{
"epoch": 4.828828828828829,
"grad_norm": 0.23898230675599963,
"learning_rate": 5e-06,
"loss": 0.1906,
"step": 536
},
{
"epoch": 4.837837837837838,
"grad_norm": 0.24493103786606743,
"learning_rate": 5e-06,
"loss": 0.2157,
"step": 537
},
{
"epoch": 4.846846846846847,
"grad_norm": 0.2513533399485069,
"learning_rate": 5e-06,
"loss": 0.2251,
"step": 538
},
{
"epoch": 4.8558558558558556,
"grad_norm": 0.25335345934289205,
"learning_rate": 5e-06,
"loss": 0.1981,
"step": 539
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.24569861483369518,
"learning_rate": 5e-06,
"loss": 0.1923,
"step": 540
},
{
"epoch": 4.873873873873874,
"grad_norm": 0.3107988513160903,
"learning_rate": 5e-06,
"loss": 0.2022,
"step": 541
},
{
"epoch": 4.882882882882883,
"grad_norm": 0.2440474159047901,
"learning_rate": 5e-06,
"loss": 0.2102,
"step": 542
},
{
"epoch": 4.891891891891892,
"grad_norm": 0.269910179414699,
"learning_rate": 5e-06,
"loss": 0.2023,
"step": 543
},
{
"epoch": 4.900900900900901,
"grad_norm": 0.28697178165278897,
"learning_rate": 5e-06,
"loss": 0.2026,
"step": 544
},
{
"epoch": 4.90990990990991,
"grad_norm": 0.27623354559228885,
"learning_rate": 5e-06,
"loss": 0.2477,
"step": 545
},
{
"epoch": 4.918918918918919,
"grad_norm": 0.2598034982021407,
"learning_rate": 5e-06,
"loss": 0.202,
"step": 546
},
{
"epoch": 4.927927927927928,
"grad_norm": 0.2982050262473221,
"learning_rate": 5e-06,
"loss": 0.1987,
"step": 547
},
{
"epoch": 4.936936936936937,
"grad_norm": 0.2506438136769937,
"learning_rate": 5e-06,
"loss": 0.2332,
"step": 548
},
{
"epoch": 4.945945945945946,
"grad_norm": 0.2619541945846186,
"learning_rate": 5e-06,
"loss": 0.1929,
"step": 549
},
{
"epoch": 4.954954954954955,
"grad_norm": 0.263321542176826,
"learning_rate": 5e-06,
"loss": 0.199,
"step": 550
},
{
"epoch": 4.963963963963964,
"grad_norm": 0.2601674233941214,
"learning_rate": 5e-06,
"loss": 0.2515,
"step": 551
},
{
"epoch": 4.972972972972973,
"grad_norm": 0.29934162295077227,
"learning_rate": 5e-06,
"loss": 0.2177,
"step": 552
},
{
"epoch": 4.981981981981982,
"grad_norm": 0.24535555883333707,
"learning_rate": 5e-06,
"loss": 0.2038,
"step": 553
},
{
"epoch": 4.990990990990991,
"grad_norm": 0.2743717244598402,
"learning_rate": 5e-06,
"loss": 0.1904,
"step": 554
},
{
"epoch": 5.0,
"grad_norm": 0.2346279054988279,
"learning_rate": 5e-06,
"loss": 0.2318,
"step": 555
},
{
"epoch": 5.009009009009009,
"grad_norm": 0.3139762280953865,
"learning_rate": 5e-06,
"loss": 0.2065,
"step": 556
},
{
"epoch": 5.018018018018018,
"grad_norm": 0.2318927222535076,
"learning_rate": 5e-06,
"loss": 0.2063,
"step": 557
},
{
"epoch": 5.027027027027027,
"grad_norm": 0.25650614267529076,
"learning_rate": 5e-06,
"loss": 0.2042,
"step": 558
},
{
"epoch": 5.036036036036036,
"grad_norm": 0.25768317605269925,
"learning_rate": 5e-06,
"loss": 0.2057,
"step": 559
},
{
"epoch": 5.045045045045045,
"grad_norm": 0.29060238578973707,
"learning_rate": 5e-06,
"loss": 0.1944,
"step": 560
},
{
"epoch": 5.054054054054054,
"grad_norm": 0.28407299845741896,
"learning_rate": 5e-06,
"loss": 0.1718,
"step": 561
},
{
"epoch": 5.063063063063063,
"grad_norm": 0.29213793767158686,
"learning_rate": 5e-06,
"loss": 0.1878,
"step": 562
},
{
"epoch": 5.072072072072072,
"grad_norm": 0.26810675570875164,
"learning_rate": 5e-06,
"loss": 0.2125,
"step": 563
},
{
"epoch": 5.081081081081081,
"grad_norm": 0.2692377641775085,
"learning_rate": 5e-06,
"loss": 0.1846,
"step": 564
},
{
"epoch": 5.09009009009009,
"grad_norm": 0.405649877673358,
"learning_rate": 5e-06,
"loss": 0.1837,
"step": 565
},
{
"epoch": 5.099099099099099,
"grad_norm": 0.26726682072971775,
"learning_rate": 5e-06,
"loss": 0.2062,
"step": 566
},
{
"epoch": 5.108108108108108,
"grad_norm": 0.2940841675590565,
"learning_rate": 5e-06,
"loss": 0.2165,
"step": 567
},
{
"epoch": 5.117117117117117,
"grad_norm": 0.3398159316706572,
"learning_rate": 5e-06,
"loss": 0.1926,
"step": 568
},
{
"epoch": 5.126126126126126,
"grad_norm": 0.2826251512922728,
"learning_rate": 5e-06,
"loss": 0.1848,
"step": 569
},
{
"epoch": 5.135135135135135,
"grad_norm": 0.25092563468699364,
"learning_rate": 5e-06,
"loss": 0.193,
"step": 570
},
{
"epoch": 5.1441441441441444,
"grad_norm": 0.25159248777659954,
"learning_rate": 5e-06,
"loss": 0.1723,
"step": 571
},
{
"epoch": 5.153153153153153,
"grad_norm": 0.2681017671845892,
"learning_rate": 5e-06,
"loss": 0.1634,
"step": 572
},
{
"epoch": 5.162162162162162,
"grad_norm": 0.2733469299319058,
"learning_rate": 5e-06,
"loss": 0.1702,
"step": 573
},
{
"epoch": 5.171171171171171,
"grad_norm": 0.2643697126326926,
"learning_rate": 5e-06,
"loss": 0.2108,
"step": 574
},
{
"epoch": 5.18018018018018,
"grad_norm": 0.2929652382664824,
"learning_rate": 5e-06,
"loss": 0.2104,
"step": 575
},
{
"epoch": 5.1891891891891895,
"grad_norm": 0.30518478646977765,
"learning_rate": 5e-06,
"loss": 0.2049,
"step": 576
},
{
"epoch": 5.198198198198198,
"grad_norm": 0.29565787595285775,
"learning_rate": 5e-06,
"loss": 0.1701,
"step": 577
},
{
"epoch": 5.207207207207207,
"grad_norm": 0.24799846379048632,
"learning_rate": 5e-06,
"loss": 0.1849,
"step": 578
},
{
"epoch": 5.216216216216216,
"grad_norm": 0.26812878158143444,
"learning_rate": 5e-06,
"loss": 0.1939,
"step": 579
},
{
"epoch": 5.225225225225225,
"grad_norm": 0.2832327785366025,
"learning_rate": 5e-06,
"loss": 0.1748,
"step": 580
},
{
"epoch": 5.2342342342342345,
"grad_norm": 0.24530353488882148,
"learning_rate": 5e-06,
"loss": 0.1672,
"step": 581
},
{
"epoch": 5.243243243243243,
"grad_norm": 0.267893260322143,
"learning_rate": 5e-06,
"loss": 0.1982,
"step": 582
},
{
"epoch": 5.252252252252252,
"grad_norm": 0.28205728775241223,
"learning_rate": 5e-06,
"loss": 0.1402,
"step": 583
},
{
"epoch": 5.261261261261261,
"grad_norm": 0.2616195565718879,
"learning_rate": 5e-06,
"loss": 0.1669,
"step": 584
},
{
"epoch": 5.27027027027027,
"grad_norm": 0.2623448971573745,
"learning_rate": 5e-06,
"loss": 0.2006,
"step": 585
},
{
"epoch": 5.2792792792792795,
"grad_norm": 0.24193944254287217,
"learning_rate": 5e-06,
"loss": 0.1737,
"step": 586
},
{
"epoch": 5.288288288288288,
"grad_norm": 0.27208641316196014,
"learning_rate": 5e-06,
"loss": 0.1837,
"step": 587
},
{
"epoch": 5.297297297297297,
"grad_norm": 0.25067910651417047,
"learning_rate": 5e-06,
"loss": 0.1934,
"step": 588
},
{
"epoch": 5.306306306306306,
"grad_norm": 0.25385900871383876,
"learning_rate": 5e-06,
"loss": 0.1689,
"step": 589
},
{
"epoch": 5.315315315315315,
"grad_norm": 0.32902079040677734,
"learning_rate": 5e-06,
"loss": 0.1534,
"step": 590
},
{
"epoch": 5.324324324324325,
"grad_norm": 0.2529027343155485,
"learning_rate": 5e-06,
"loss": 0.2102,
"step": 591
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.28906659508958055,
"learning_rate": 5e-06,
"loss": 0.181,
"step": 592
},
{
"epoch": 5.342342342342342,
"grad_norm": 0.282108480924088,
"learning_rate": 5e-06,
"loss": 0.2128,
"step": 593
},
{
"epoch": 5.351351351351352,
"grad_norm": 0.2604161116106256,
"learning_rate": 5e-06,
"loss": 0.1844,
"step": 594
},
{
"epoch": 5.36036036036036,
"grad_norm": 0.2789492989923241,
"learning_rate": 5e-06,
"loss": 0.1683,
"step": 595
},
{
"epoch": 5.36936936936937,
"grad_norm": 0.2559431308271593,
"learning_rate": 5e-06,
"loss": 0.2137,
"step": 596
},
{
"epoch": 5.378378378378378,
"grad_norm": 0.30088029917481107,
"learning_rate": 5e-06,
"loss": 0.1892,
"step": 597
},
{
"epoch": 5.387387387387387,
"grad_norm": 0.26253812275245714,
"learning_rate": 5e-06,
"loss": 0.1765,
"step": 598
},
{
"epoch": 5.396396396396397,
"grad_norm": 0.26495943964336816,
"learning_rate": 5e-06,
"loss": 0.1843,
"step": 599
},
{
"epoch": 5.405405405405405,
"grad_norm": 0.25894821975432253,
"learning_rate": 5e-06,
"loss": 0.1881,
"step": 600
},
{
"epoch": 5.414414414414415,
"grad_norm": 0.24931805970093998,
"learning_rate": 5e-06,
"loss": 0.1878,
"step": 601
},
{
"epoch": 5.423423423423423,
"grad_norm": 0.23455479372929255,
"learning_rate": 5e-06,
"loss": 0.1906,
"step": 602
},
{
"epoch": 5.4324324324324325,
"grad_norm": 0.25467507848802673,
"learning_rate": 5e-06,
"loss": 0.1717,
"step": 603
},
{
"epoch": 5.441441441441442,
"grad_norm": 0.33202611172740315,
"learning_rate": 5e-06,
"loss": 0.1688,
"step": 604
},
{
"epoch": 5.45045045045045,
"grad_norm": 0.29109320447844156,
"learning_rate": 5e-06,
"loss": 0.2029,
"step": 605
},
{
"epoch": 5.45945945945946,
"grad_norm": 0.24981105367499418,
"learning_rate": 5e-06,
"loss": 0.1941,
"step": 606
},
{
"epoch": 5.468468468468468,
"grad_norm": 0.24367601204155379,
"learning_rate": 5e-06,
"loss": 0.1671,
"step": 607
},
{
"epoch": 5.4774774774774775,
"grad_norm": 0.2932072155115799,
"learning_rate": 5e-06,
"loss": 0.1705,
"step": 608
},
{
"epoch": 5.486486486486487,
"grad_norm": 0.2882005482228378,
"learning_rate": 5e-06,
"loss": 0.2065,
"step": 609
},
{
"epoch": 5.495495495495495,
"grad_norm": 0.25719022643699463,
"learning_rate": 5e-06,
"loss": 0.2111,
"step": 610
},
{
"epoch": 5.504504504504505,
"grad_norm": 0.2611846325545377,
"learning_rate": 5e-06,
"loss": 0.2016,
"step": 611
},
{
"epoch": 5.513513513513513,
"grad_norm": 0.23251839540489064,
"learning_rate": 5e-06,
"loss": 0.1711,
"step": 612
},
{
"epoch": 5.5225225225225225,
"grad_norm": 0.3956644880260737,
"learning_rate": 5e-06,
"loss": 0.1794,
"step": 613
},
{
"epoch": 5.531531531531532,
"grad_norm": 0.27250839467887433,
"learning_rate": 5e-06,
"loss": 0.1676,
"step": 614
},
{
"epoch": 5.54054054054054,
"grad_norm": 0.2638663157341973,
"learning_rate": 5e-06,
"loss": 0.189,
"step": 615
},
{
"epoch": 5.54954954954955,
"grad_norm": 0.2635087015420886,
"learning_rate": 5e-06,
"loss": 0.1894,
"step": 616
},
{
"epoch": 5.558558558558558,
"grad_norm": 0.25884441144311887,
"learning_rate": 5e-06,
"loss": 0.1625,
"step": 617
},
{
"epoch": 5.5675675675675675,
"grad_norm": 0.33989481367732455,
"learning_rate": 5e-06,
"loss": 0.1632,
"step": 618
},
{
"epoch": 5.576576576576577,
"grad_norm": 0.33951958077722966,
"learning_rate": 5e-06,
"loss": 0.1956,
"step": 619
},
{
"epoch": 5.585585585585585,
"grad_norm": 0.2547652235180218,
"learning_rate": 5e-06,
"loss": 0.2291,
"step": 620
},
{
"epoch": 5.594594594594595,
"grad_norm": 0.24750295719042112,
"learning_rate": 5e-06,
"loss": 0.1726,
"step": 621
},
{
"epoch": 5.603603603603604,
"grad_norm": 0.2508541551180729,
"learning_rate": 5e-06,
"loss": 0.1776,
"step": 622
},
{
"epoch": 5.612612612612613,
"grad_norm": 0.2506039467248062,
"learning_rate": 5e-06,
"loss": 0.2185,
"step": 623
},
{
"epoch": 5.621621621621622,
"grad_norm": 0.26927980609500457,
"learning_rate": 5e-06,
"loss": 0.2051,
"step": 624
},
{
"epoch": 5.63063063063063,
"grad_norm": 0.2902598041361342,
"learning_rate": 5e-06,
"loss": 0.1836,
"step": 625
},
{
"epoch": 5.63963963963964,
"grad_norm": 0.27400647533943007,
"learning_rate": 5e-06,
"loss": 0.187,
"step": 626
},
{
"epoch": 5.648648648648649,
"grad_norm": 0.29199710457207273,
"learning_rate": 5e-06,
"loss": 0.1969,
"step": 627
},
{
"epoch": 5.657657657657658,
"grad_norm": 0.3025760209241755,
"learning_rate": 5e-06,
"loss": 0.1745,
"step": 628
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.2863200552497931,
"learning_rate": 5e-06,
"loss": 0.1763,
"step": 629
},
{
"epoch": 5.675675675675675,
"grad_norm": 0.3046187504171871,
"learning_rate": 5e-06,
"loss": 0.1765,
"step": 630
},
{
"epoch": 5.684684684684685,
"grad_norm": 0.2594010152562734,
"learning_rate": 5e-06,
"loss": 0.1922,
"step": 631
},
{
"epoch": 5.693693693693694,
"grad_norm": 0.25276964471192975,
"learning_rate": 5e-06,
"loss": 0.2011,
"step": 632
},
{
"epoch": 5.702702702702703,
"grad_norm": 0.2457717647956263,
"learning_rate": 5e-06,
"loss": 0.1878,
"step": 633
},
{
"epoch": 5.711711711711712,
"grad_norm": 0.27348692878164155,
"learning_rate": 5e-06,
"loss": 0.1992,
"step": 634
},
{
"epoch": 5.7207207207207205,
"grad_norm": 0.2599835122727351,
"learning_rate": 5e-06,
"loss": 0.1649,
"step": 635
},
{
"epoch": 5.72972972972973,
"grad_norm": 0.2712466634459408,
"learning_rate": 5e-06,
"loss": 0.2089,
"step": 636
},
{
"epoch": 5.738738738738739,
"grad_norm": 0.2732874374632016,
"learning_rate": 5e-06,
"loss": 0.1613,
"step": 637
},
{
"epoch": 5.747747747747748,
"grad_norm": 0.3335330997010001,
"learning_rate": 5e-06,
"loss": 0.2057,
"step": 638
},
{
"epoch": 5.756756756756757,
"grad_norm": 0.29795878105581997,
"learning_rate": 5e-06,
"loss": 0.1619,
"step": 639
},
{
"epoch": 5.7657657657657655,
"grad_norm": 0.27969811406236256,
"learning_rate": 5e-06,
"loss": 0.212,
"step": 640
},
{
"epoch": 5.774774774774775,
"grad_norm": 0.26108347760571876,
"learning_rate": 5e-06,
"loss": 0.1898,
"step": 641
},
{
"epoch": 5.783783783783784,
"grad_norm": 0.2954357533804664,
"learning_rate": 5e-06,
"loss": 0.1845,
"step": 642
},
{
"epoch": 5.792792792792793,
"grad_norm": 0.2897237531310712,
"learning_rate": 5e-06,
"loss": 0.1778,
"step": 643
},
{
"epoch": 5.801801801801802,
"grad_norm": 0.2862916112801224,
"learning_rate": 5e-06,
"loss": 0.1616,
"step": 644
},
{
"epoch": 5.8108108108108105,
"grad_norm": 0.2655242503575118,
"learning_rate": 5e-06,
"loss": 0.2125,
"step": 645
},
{
"epoch": 5.81981981981982,
"grad_norm": 0.29888735697965757,
"learning_rate": 5e-06,
"loss": 0.17,
"step": 646
},
{
"epoch": 5.828828828828829,
"grad_norm": 0.3878759366680586,
"learning_rate": 5e-06,
"loss": 0.1747,
"step": 647
},
{
"epoch": 5.837837837837838,
"grad_norm": 0.2780142798396489,
"learning_rate": 5e-06,
"loss": 0.208,
"step": 648
},
{
"epoch": 5.846846846846847,
"grad_norm": 0.32189822402929597,
"learning_rate": 5e-06,
"loss": 0.1903,
"step": 649
},
{
"epoch": 5.8558558558558556,
"grad_norm": 0.26666263423680436,
"learning_rate": 5e-06,
"loss": 0.1753,
"step": 650
},
{
"epoch": 5.864864864864865,
"grad_norm": 0.26508147192359016,
"learning_rate": 5e-06,
"loss": 0.1956,
"step": 651
},
{
"epoch": 5.873873873873874,
"grad_norm": 0.2800648058751269,
"learning_rate": 5e-06,
"loss": 0.1796,
"step": 652
},
{
"epoch": 5.882882882882883,
"grad_norm": 0.25602425755319697,
"learning_rate": 5e-06,
"loss": 0.2028,
"step": 653
},
{
"epoch": 5.891891891891892,
"grad_norm": 0.296930917045288,
"learning_rate": 5e-06,
"loss": 0.1924,
"step": 654
},
{
"epoch": 5.900900900900901,
"grad_norm": 0.3452776489094155,
"learning_rate": 5e-06,
"loss": 0.1964,
"step": 655
},
{
"epoch": 5.90990990990991,
"grad_norm": 0.2910698717731606,
"learning_rate": 5e-06,
"loss": 0.1751,
"step": 656
},
{
"epoch": 5.918918918918919,
"grad_norm": 0.2591517828645954,
"learning_rate": 5e-06,
"loss": 0.187,
"step": 657
},
{
"epoch": 5.927927927927928,
"grad_norm": 0.32146446439072224,
"learning_rate": 5e-06,
"loss": 0.1945,
"step": 658
},
{
"epoch": 5.936936936936937,
"grad_norm": 0.29057113691944186,
"learning_rate": 5e-06,
"loss": 0.1796,
"step": 659
},
{
"epoch": 5.945945945945946,
"grad_norm": 0.2673772774533524,
"learning_rate": 5e-06,
"loss": 0.1871,
"step": 660
},
{
"epoch": 5.954954954954955,
"grad_norm": 0.25292557260096377,
"learning_rate": 5e-06,
"loss": 0.1748,
"step": 661
},
{
"epoch": 5.963963963963964,
"grad_norm": 0.24315435369686791,
"learning_rate": 5e-06,
"loss": 0.1838,
"step": 662
},
{
"epoch": 5.972972972972973,
"grad_norm": 0.30275438050027514,
"learning_rate": 5e-06,
"loss": 0.1633,
"step": 663
},
{
"epoch": 5.981981981981982,
"grad_norm": 0.28436057893273076,
"learning_rate": 5e-06,
"loss": 0.1876,
"step": 664
},
{
"epoch": 5.990990990990991,
"grad_norm": 0.28562922979220184,
"learning_rate": 5e-06,
"loss": 0.2009,
"step": 665
},
{
"epoch": 6.0,
"grad_norm": 0.2653424601600143,
"learning_rate": 5e-06,
"loss": 0.1905,
"step": 666
},
{
"epoch": 6.009009009009009,
"grad_norm": 0.2860376096075966,
"learning_rate": 5e-06,
"loss": 0.1957,
"step": 667
},
{
"epoch": 6.018018018018018,
"grad_norm": 0.25196665164096865,
"learning_rate": 5e-06,
"loss": 0.1563,
"step": 668
},
{
"epoch": 6.027027027027027,
"grad_norm": 0.24029344524647256,
"learning_rate": 5e-06,
"loss": 0.1879,
"step": 669
},
{
"epoch": 6.036036036036036,
"grad_norm": 0.2620085799429486,
"learning_rate": 5e-06,
"loss": 0.1811,
"step": 670
},
{
"epoch": 6.045045045045045,
"grad_norm": 0.27308115959180734,
"learning_rate": 5e-06,
"loss": 0.155,
"step": 671
},
{
"epoch": 6.054054054054054,
"grad_norm": 0.26803737868546207,
"learning_rate": 5e-06,
"loss": 0.1537,
"step": 672
},
{
"epoch": 6.063063063063063,
"grad_norm": 0.30441930072274076,
"learning_rate": 5e-06,
"loss": 0.1902,
"step": 673
},
{
"epoch": 6.072072072072072,
"grad_norm": 0.2465984202629159,
"learning_rate": 5e-06,
"loss": 0.1807,
"step": 674
},
{
"epoch": 6.081081081081081,
"grad_norm": 0.2674335217467193,
"learning_rate": 5e-06,
"loss": 0.1487,
"step": 675
},
{
"epoch": 6.09009009009009,
"grad_norm": 0.2905204800351543,
"learning_rate": 5e-06,
"loss": 0.1567,
"step": 676
},
{
"epoch": 6.099099099099099,
"grad_norm": 0.2954597077236978,
"learning_rate": 5e-06,
"loss": 0.1535,
"step": 677
},
{
"epoch": 6.108108108108108,
"grad_norm": 0.3045298345267351,
"learning_rate": 5e-06,
"loss": 0.1689,
"step": 678
},
{
"epoch": 6.117117117117117,
"grad_norm": 0.2740781768489349,
"learning_rate": 5e-06,
"loss": 0.1797,
"step": 679
},
{
"epoch": 6.126126126126126,
"grad_norm": 0.39475528450021763,
"learning_rate": 5e-06,
"loss": 0.1617,
"step": 680
},
{
"epoch": 6.135135135135135,
"grad_norm": 0.6235225287605396,
"learning_rate": 5e-06,
"loss": 0.1238,
"step": 681
},
{
"epoch": 6.1441441441441444,
"grad_norm": 0.26845753887421847,
"learning_rate": 5e-06,
"loss": 0.1511,
"step": 682
},
{
"epoch": 6.153153153153153,
"grad_norm": 0.3602960092750115,
"learning_rate": 5e-06,
"loss": 0.1678,
"step": 683
},
{
"epoch": 6.162162162162162,
"grad_norm": 0.33224893659794336,
"learning_rate": 5e-06,
"loss": 0.1412,
"step": 684
},
{
"epoch": 6.171171171171171,
"grad_norm": 0.24094527332728147,
"learning_rate": 5e-06,
"loss": 0.1695,
"step": 685
},
{
"epoch": 6.18018018018018,
"grad_norm": 0.341428905288911,
"learning_rate": 5e-06,
"loss": 0.1839,
"step": 686
},
{
"epoch": 6.1891891891891895,
"grad_norm": 0.2956801407312396,
"learning_rate": 5e-06,
"loss": 0.1692,
"step": 687
},
{
"epoch": 6.198198198198198,
"grad_norm": 0.3054866867274709,
"learning_rate": 5e-06,
"loss": 0.1546,
"step": 688
},
{
"epoch": 6.207207207207207,
"grad_norm": 0.23806841375933424,
"learning_rate": 5e-06,
"loss": 0.1554,
"step": 689
},
{
"epoch": 6.216216216216216,
"grad_norm": 0.24481339250784975,
"learning_rate": 5e-06,
"loss": 0.1953,
"step": 690
},
{
"epoch": 6.225225225225225,
"grad_norm": 0.3014128409778474,
"learning_rate": 5e-06,
"loss": 0.1674,
"step": 691
},
{
"epoch": 6.2342342342342345,
"grad_norm": 0.2708989701315342,
"learning_rate": 5e-06,
"loss": 0.1479,
"step": 692
},
{
"epoch": 6.243243243243243,
"grad_norm": 0.31024609108715306,
"learning_rate": 5e-06,
"loss": 0.1553,
"step": 693
},
{
"epoch": 6.252252252252252,
"grad_norm": 0.29134393470015496,
"learning_rate": 5e-06,
"loss": 0.1437,
"step": 694
},
{
"epoch": 6.261261261261261,
"grad_norm": 0.2793592485054197,
"learning_rate": 5e-06,
"loss": 0.1684,
"step": 695
},
{
"epoch": 6.27027027027027,
"grad_norm": 0.30498815020407055,
"learning_rate": 5e-06,
"loss": 0.1968,
"step": 696
},
{
"epoch": 6.2792792792792795,
"grad_norm": 0.2652672098205942,
"learning_rate": 5e-06,
"loss": 0.1565,
"step": 697
},
{
"epoch": 6.288288288288288,
"grad_norm": 0.3018458330908521,
"learning_rate": 5e-06,
"loss": 0.1726,
"step": 698
},
{
"epoch": 6.297297297297297,
"grad_norm": 0.2592172426217306,
"learning_rate": 5e-06,
"loss": 0.1656,
"step": 699
},
{
"epoch": 6.306306306306306,
"grad_norm": 0.2565291008895072,
"learning_rate": 5e-06,
"loss": 0.1981,
"step": 700
},
{
"epoch": 6.315315315315315,
"grad_norm": 0.2980025331247,
"learning_rate": 5e-06,
"loss": 0.1656,
"step": 701
},
{
"epoch": 6.324324324324325,
"grad_norm": 0.26039013888986284,
"learning_rate": 5e-06,
"loss": 0.1656,
"step": 702
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.2568935923552546,
"learning_rate": 5e-06,
"loss": 0.1451,
"step": 703
},
{
"epoch": 6.342342342342342,
"grad_norm": 0.29222987592831656,
"learning_rate": 5e-06,
"loss": 0.161,
"step": 704
},
{
"epoch": 6.351351351351352,
"grad_norm": 0.2622511449178775,
"learning_rate": 5e-06,
"loss": 0.1553,
"step": 705
},
{
"epoch": 6.36036036036036,
"grad_norm": 0.2703894332854895,
"learning_rate": 5e-06,
"loss": 0.1803,
"step": 706
},
{
"epoch": 6.36936936936937,
"grad_norm": 0.2516505913848481,
"learning_rate": 5e-06,
"loss": 0.16,
"step": 707
},
{
"epoch": 6.378378378378378,
"grad_norm": 0.26750256687760715,
"learning_rate": 5e-06,
"loss": 0.1461,
"step": 708
},
{
"epoch": 6.387387387387387,
"grad_norm": 0.2539871109081379,
"learning_rate": 5e-06,
"loss": 0.1488,
"step": 709
},
{
"epoch": 6.396396396396397,
"grad_norm": 0.2769403607227516,
"learning_rate": 5e-06,
"loss": 0.1924,
"step": 710
},
{
"epoch": 6.405405405405405,
"grad_norm": 0.2946720991492928,
"learning_rate": 5e-06,
"loss": 0.1503,
"step": 711
},
{
"epoch": 6.414414414414415,
"grad_norm": 0.24458166835948247,
"learning_rate": 5e-06,
"loss": 0.1855,
"step": 712
},
{
"epoch": 6.423423423423423,
"grad_norm": 0.2840232732624716,
"learning_rate": 5e-06,
"loss": 0.1643,
"step": 713
},
{
"epoch": 6.4324324324324325,
"grad_norm": 0.31757015141649597,
"learning_rate": 5e-06,
"loss": 0.1622,
"step": 714
},
{
"epoch": 6.441441441441442,
"grad_norm": 0.28847324036631117,
"learning_rate": 5e-06,
"loss": 0.1365,
"step": 715
},
{
"epoch": 6.45045045045045,
"grad_norm": 0.24694988398848988,
"learning_rate": 5e-06,
"loss": 0.1594,
"step": 716
},
{
"epoch": 6.45945945945946,
"grad_norm": 0.29307213864672693,
"learning_rate": 5e-06,
"loss": 0.1411,
"step": 717
},
{
"epoch": 6.468468468468468,
"grad_norm": 0.30163977699200506,
"learning_rate": 5e-06,
"loss": 0.1649,
"step": 718
},
{
"epoch": 6.4774774774774775,
"grad_norm": 0.2854457863377953,
"learning_rate": 5e-06,
"loss": 0.1918,
"step": 719
},
{
"epoch": 6.486486486486487,
"grad_norm": 0.27342932900047295,
"learning_rate": 5e-06,
"loss": 0.176,
"step": 720
},
{
"epoch": 6.495495495495495,
"grad_norm": 0.28175783115173536,
"learning_rate": 5e-06,
"loss": 0.1422,
"step": 721
},
{
"epoch": 6.504504504504505,
"grad_norm": 0.2840989876099184,
"learning_rate": 5e-06,
"loss": 0.1633,
"step": 722
},
{
"epoch": 6.513513513513513,
"grad_norm": 0.2867793910350591,
"learning_rate": 5e-06,
"loss": 0.1796,
"step": 723
},
{
"epoch": 6.5225225225225225,
"grad_norm": 0.28428808863989385,
"learning_rate": 5e-06,
"loss": 0.1529,
"step": 724
},
{
"epoch": 6.531531531531532,
"grad_norm": 0.29279024558574074,
"learning_rate": 5e-06,
"loss": 0.1392,
"step": 725
},
{
"epoch": 6.54054054054054,
"grad_norm": 0.23321656651720726,
"learning_rate": 5e-06,
"loss": 0.1472,
"step": 726
},
{
"epoch": 6.54954954954955,
"grad_norm": 0.27521328727823563,
"learning_rate": 5e-06,
"loss": 0.1577,
"step": 727
},
{
"epoch": 6.558558558558558,
"grad_norm": 0.32541783429708115,
"learning_rate": 5e-06,
"loss": 0.1782,
"step": 728
},
{
"epoch": 6.5675675675675675,
"grad_norm": 0.4716962818206086,
"learning_rate": 5e-06,
"loss": 0.1247,
"step": 729
},
{
"epoch": 6.576576576576577,
"grad_norm": 0.26285903206886113,
"learning_rate": 5e-06,
"loss": 0.1744,
"step": 730
},
{
"epoch": 6.585585585585585,
"grad_norm": 0.30168633716148247,
"learning_rate": 5e-06,
"loss": 0.1422,
"step": 731
},
{
"epoch": 6.594594594594595,
"grad_norm": 0.2745027764141301,
"learning_rate": 5e-06,
"loss": 0.1672,
"step": 732
},
{
"epoch": 6.603603603603604,
"grad_norm": 0.35380479132918236,
"learning_rate": 5e-06,
"loss": 0.1663,
"step": 733
},
{
"epoch": 6.612612612612613,
"grad_norm": 0.27601944134435535,
"learning_rate": 5e-06,
"loss": 0.1695,
"step": 734
},
{
"epoch": 6.621621621621622,
"grad_norm": 0.2528980448808799,
"learning_rate": 5e-06,
"loss": 0.2017,
"step": 735
},
{
"epoch": 6.63063063063063,
"grad_norm": 0.30767286206094524,
"learning_rate": 5e-06,
"loss": 0.1261,
"step": 736
},
{
"epoch": 6.63963963963964,
"grad_norm": 0.2602137688236013,
"learning_rate": 5e-06,
"loss": 0.1636,
"step": 737
},
{
"epoch": 6.648648648648649,
"grad_norm": 0.23824598012350529,
"learning_rate": 5e-06,
"loss": 0.1584,
"step": 738
},
{
"epoch": 6.657657657657658,
"grad_norm": 0.29134756631872455,
"learning_rate": 5e-06,
"loss": 0.1835,
"step": 739
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.25765774787058354,
"learning_rate": 5e-06,
"loss": 0.1603,
"step": 740
},
{
"epoch": 6.675675675675675,
"grad_norm": 0.2600078403016356,
"learning_rate": 5e-06,
"loss": 0.1826,
"step": 741
},
{
"epoch": 6.684684684684685,
"grad_norm": 0.2617835836231004,
"learning_rate": 5e-06,
"loss": 0.1464,
"step": 742
},
{
"epoch": 6.693693693693694,
"grad_norm": 0.32078684057749896,
"learning_rate": 5e-06,
"loss": 0.1252,
"step": 743
},
{
"epoch": 6.702702702702703,
"grad_norm": 0.29351670808174113,
"learning_rate": 5e-06,
"loss": 0.1548,
"step": 744
},
{
"epoch": 6.711711711711712,
"grad_norm": 0.30854251276850175,
"learning_rate": 5e-06,
"loss": 0.137,
"step": 745
},
{
"epoch": 6.7207207207207205,
"grad_norm": 0.26688862536435537,
"learning_rate": 5e-06,
"loss": 0.124,
"step": 746
},
{
"epoch": 6.72972972972973,
"grad_norm": 0.44923760580414157,
"learning_rate": 5e-06,
"loss": 0.1736,
"step": 747
},
{
"epoch": 6.738738738738739,
"grad_norm": 0.39218610199513526,
"learning_rate": 5e-06,
"loss": 0.1418,
"step": 748
},
{
"epoch": 6.747747747747748,
"grad_norm": 0.2664334967715308,
"learning_rate": 5e-06,
"loss": 0.1517,
"step": 749
},
{
"epoch": 6.756756756756757,
"grad_norm": 0.28834080697901254,
"learning_rate": 5e-06,
"loss": 0.1422,
"step": 750
},
{
"epoch": 6.7657657657657655,
"grad_norm": 0.29170783307220777,
"learning_rate": 5e-06,
"loss": 0.1466,
"step": 751
},
{
"epoch": 6.774774774774775,
"grad_norm": 0.2802071333171322,
"learning_rate": 5e-06,
"loss": 0.2085,
"step": 752
},
{
"epoch": 6.783783783783784,
"grad_norm": 0.27926590125257916,
"learning_rate": 5e-06,
"loss": 0.1923,
"step": 753
},
{
"epoch": 6.792792792792793,
"grad_norm": 0.26518681255237936,
"learning_rate": 5e-06,
"loss": 0.1766,
"step": 754
},
{
"epoch": 6.801801801801802,
"grad_norm": 0.347354240325402,
"learning_rate": 5e-06,
"loss": 0.1825,
"step": 755
},
{
"epoch": 6.8108108108108105,
"grad_norm": 0.2783190286987182,
"learning_rate": 5e-06,
"loss": 0.169,
"step": 756
},
{
"epoch": 6.81981981981982,
"grad_norm": 0.25398065438526435,
"learning_rate": 5e-06,
"loss": 0.1586,
"step": 757
},
{
"epoch": 6.828828828828829,
"grad_norm": 0.2677100544625917,
"learning_rate": 5e-06,
"loss": 0.1422,
"step": 758
},
{
"epoch": 6.837837837837838,
"grad_norm": 0.25767020061357093,
"learning_rate": 5e-06,
"loss": 0.1562,
"step": 759
},
{
"epoch": 6.846846846846847,
"grad_norm": 0.2741476785712207,
"learning_rate": 5e-06,
"loss": 0.1672,
"step": 760
},
{
"epoch": 6.8558558558558556,
"grad_norm": 0.3040893781704407,
"learning_rate": 5e-06,
"loss": 0.1571,
"step": 761
},
{
"epoch": 6.864864864864865,
"grad_norm": 0.3190900071250576,
"learning_rate": 5e-06,
"loss": 0.1424,
"step": 762
},
{
"epoch": 6.873873873873874,
"grad_norm": 0.29513021972878545,
"learning_rate": 5e-06,
"loss": 0.1553,
"step": 763
},
{
"epoch": 6.882882882882883,
"grad_norm": 0.24584895713037455,
"learning_rate": 5e-06,
"loss": 0.1648,
"step": 764
},
{
"epoch": 6.891891891891892,
"grad_norm": 0.29616482928194166,
"learning_rate": 5e-06,
"loss": 0.1628,
"step": 765
},
{
"epoch": 6.900900900900901,
"grad_norm": 0.2698717209619555,
"learning_rate": 5e-06,
"loss": 0.1486,
"step": 766
},
{
"epoch": 6.90990990990991,
"grad_norm": 0.2842949721560408,
"learning_rate": 5e-06,
"loss": 0.1533,
"step": 767
},
{
"epoch": 6.918918918918919,
"grad_norm": 0.2548658270502879,
"learning_rate": 5e-06,
"loss": 0.1828,
"step": 768
},
{
"epoch": 6.927927927927928,
"grad_norm": 0.29650593537088255,
"learning_rate": 5e-06,
"loss": 0.1609,
"step": 769
},
{
"epoch": 6.936936936936937,
"grad_norm": 0.28258003962728084,
"learning_rate": 5e-06,
"loss": 0.1593,
"step": 770
},
{
"epoch": 6.945945945945946,
"grad_norm": 0.2635611766361993,
"learning_rate": 5e-06,
"loss": 0.1854,
"step": 771
},
{
"epoch": 6.954954954954955,
"grad_norm": 0.30424832142174796,
"learning_rate": 5e-06,
"loss": 0.146,
"step": 772
},
{
"epoch": 6.963963963963964,
"grad_norm": 0.3060917294529799,
"learning_rate": 5e-06,
"loss": 0.1733,
"step": 773
},
{
"epoch": 6.972972972972973,
"grad_norm": 0.32441537080653826,
"learning_rate": 5e-06,
"loss": 0.1537,
"step": 774
},
{
"epoch": 6.981981981981982,
"grad_norm": 0.2765707057627649,
"learning_rate": 5e-06,
"loss": 0.1693,
"step": 775
},
{
"epoch": 6.990990990990991,
"grad_norm": 0.2603233373640257,
"learning_rate": 5e-06,
"loss": 0.1796,
"step": 776
},
{
"epoch": 7.0,
"grad_norm": 0.2781503743653767,
"learning_rate": 5e-06,
"loss": 0.184,
"step": 777
},
{
"epoch": 7.009009009009009,
"grad_norm": 0.28846916522682,
"learning_rate": 5e-06,
"loss": 0.1327,
"step": 778
},
{
"epoch": 7.018018018018018,
"grad_norm": 0.2909211896726909,
"learning_rate": 5e-06,
"loss": 0.1436,
"step": 779
},
{
"epoch": 7.027027027027027,
"grad_norm": 0.29059846677673873,
"learning_rate": 5e-06,
"loss": 0.1264,
"step": 780
},
{
"epoch": 7.036036036036036,
"grad_norm": 0.2589502571701869,
"learning_rate": 5e-06,
"loss": 0.1433,
"step": 781
},
{
"epoch": 7.045045045045045,
"grad_norm": 0.30299652950475636,
"learning_rate": 5e-06,
"loss": 0.1139,
"step": 782
},
{
"epoch": 7.054054054054054,
"grad_norm": 0.3100291804047275,
"learning_rate": 5e-06,
"loss": 0.1592,
"step": 783
},
{
"epoch": 7.063063063063063,
"grad_norm": 0.2975691545809747,
"learning_rate": 5e-06,
"loss": 0.157,
"step": 784
},
{
"epoch": 7.072072072072072,
"grad_norm": 0.3068091793086541,
"learning_rate": 5e-06,
"loss": 0.1232,
"step": 785
},
{
"epoch": 7.081081081081081,
"grad_norm": 0.3089806282089307,
"learning_rate": 5e-06,
"loss": 0.1221,
"step": 786
},
{
"epoch": 7.09009009009009,
"grad_norm": 0.2847038649852651,
"learning_rate": 5e-06,
"loss": 0.1281,
"step": 787
},
{
"epoch": 7.099099099099099,
"grad_norm": 0.2615997547703096,
"learning_rate": 5e-06,
"loss": 0.1434,
"step": 788
},
{
"epoch": 7.108108108108108,
"grad_norm": 0.2807779495824356,
"learning_rate": 5e-06,
"loss": 0.1245,
"step": 789
},
{
"epoch": 7.117117117117117,
"grad_norm": 0.32154910892821653,
"learning_rate": 5e-06,
"loss": 0.1398,
"step": 790
},
{
"epoch": 7.126126126126126,
"grad_norm": 0.2909515213375792,
"learning_rate": 5e-06,
"loss": 0.1617,
"step": 791
},
{
"epoch": 7.135135135135135,
"grad_norm": 0.3077346518188213,
"learning_rate": 5e-06,
"loss": 0.1798,
"step": 792
},
{
"epoch": 7.1441441441441444,
"grad_norm": 0.2950118643236569,
"learning_rate": 5e-06,
"loss": 0.1189,
"step": 793
},
{
"epoch": 7.153153153153153,
"grad_norm": 0.3358905154061822,
"learning_rate": 5e-06,
"loss": 0.1145,
"step": 794
},
{
"epoch": 7.162162162162162,
"grad_norm": 0.2784628731056912,
"learning_rate": 5e-06,
"loss": 0.1413,
"step": 795
},
{
"epoch": 7.171171171171171,
"grad_norm": 0.332241278578818,
"learning_rate": 5e-06,
"loss": 0.141,
"step": 796
},
{
"epoch": 7.18018018018018,
"grad_norm": 0.3319493746279513,
"learning_rate": 5e-06,
"loss": 0.1265,
"step": 797
},
{
"epoch": 7.1891891891891895,
"grad_norm": 0.2918711891065202,
"learning_rate": 5e-06,
"loss": 0.1848,
"step": 798
},
{
"epoch": 7.198198198198198,
"grad_norm": 0.294971929143932,
"learning_rate": 5e-06,
"loss": 0.1102,
"step": 799
},
{
"epoch": 7.207207207207207,
"grad_norm": 0.28582274936569596,
"learning_rate": 5e-06,
"loss": 0.1364,
"step": 800
},
{
"epoch": 7.216216216216216,
"grad_norm": 0.2659870012399625,
"learning_rate": 5e-06,
"loss": 0.1402,
"step": 801
},
{
"epoch": 7.225225225225225,
"grad_norm": 0.27334027202909716,
"learning_rate": 5e-06,
"loss": 0.16,
"step": 802
},
{
"epoch": 7.2342342342342345,
"grad_norm": 0.29814746528630565,
"learning_rate": 5e-06,
"loss": 0.1313,
"step": 803
},
{
"epoch": 7.243243243243243,
"grad_norm": 0.30947580830786586,
"learning_rate": 5e-06,
"loss": 0.16,
"step": 804
},
{
"epoch": 7.252252252252252,
"grad_norm": 0.38586634200713993,
"learning_rate": 5e-06,
"loss": 0.1162,
"step": 805
},
{
"epoch": 7.261261261261261,
"grad_norm": 0.24887533724067495,
"learning_rate": 5e-06,
"loss": 0.1632,
"step": 806
},
{
"epoch": 7.27027027027027,
"grad_norm": 0.26959801719398596,
"learning_rate": 5e-06,
"loss": 0.152,
"step": 807
},
{
"epoch": 7.2792792792792795,
"grad_norm": 0.29035577733855455,
"learning_rate": 5e-06,
"loss": 0.1283,
"step": 808
},
{
"epoch": 7.288288288288288,
"grad_norm": 0.28290473947861045,
"learning_rate": 5e-06,
"loss": 0.1411,
"step": 809
},
{
"epoch": 7.297297297297297,
"grad_norm": 0.32523778027288563,
"learning_rate": 5e-06,
"loss": 0.1198,
"step": 810
},
{
"epoch": 7.306306306306306,
"grad_norm": 0.27833761872914975,
"learning_rate": 5e-06,
"loss": 0.1555,
"step": 811
},
{
"epoch": 7.315315315315315,
"grad_norm": 0.3367892879965876,
"learning_rate": 5e-06,
"loss": 0.1314,
"step": 812
},
{
"epoch": 7.324324324324325,
"grad_norm": 0.277313010335673,
"learning_rate": 5e-06,
"loss": 0.1364,
"step": 813
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.27375407683101277,
"learning_rate": 5e-06,
"loss": 0.1331,
"step": 814
},
{
"epoch": 7.342342342342342,
"grad_norm": 0.26697608186333877,
"learning_rate": 5e-06,
"loss": 0.1458,
"step": 815
},
{
"epoch": 7.351351351351352,
"grad_norm": 0.2974176115893814,
"learning_rate": 5e-06,
"loss": 0.1394,
"step": 816
},
{
"epoch": 7.36036036036036,
"grad_norm": 0.2764189750660692,
"learning_rate": 5e-06,
"loss": 0.1421,
"step": 817
},
{
"epoch": 7.36936936936937,
"grad_norm": 0.26103798171790754,
"learning_rate": 5e-06,
"loss": 0.1277,
"step": 818
},
{
"epoch": 7.378378378378378,
"grad_norm": 0.2702444951963583,
"learning_rate": 5e-06,
"loss": 0.1248,
"step": 819
},
{
"epoch": 7.387387387387387,
"grad_norm": 0.3091293494651806,
"learning_rate": 5e-06,
"loss": 0.107,
"step": 820
},
{
"epoch": 7.396396396396397,
"grad_norm": 0.27618979276963385,
"learning_rate": 5e-06,
"loss": 0.1286,
"step": 821
},
{
"epoch": 7.405405405405405,
"grad_norm": 0.2850325224623505,
"learning_rate": 5e-06,
"loss": 0.1524,
"step": 822
},
{
"epoch": 7.414414414414415,
"grad_norm": 0.2699678297509792,
"learning_rate": 5e-06,
"loss": 0.1504,
"step": 823
},
{
"epoch": 7.423423423423423,
"grad_norm": 0.2684653976073876,
"learning_rate": 5e-06,
"loss": 0.1472,
"step": 824
},
{
"epoch": 7.4324324324324325,
"grad_norm": 0.27562604313146905,
"learning_rate": 5e-06,
"loss": 0.1647,
"step": 825
},
{
"epoch": 7.441441441441442,
"grad_norm": 0.32271326740564915,
"learning_rate": 5e-06,
"loss": 0.1373,
"step": 826
},
{
"epoch": 7.45045045045045,
"grad_norm": 0.3017797868640937,
"learning_rate": 5e-06,
"loss": 0.1177,
"step": 827
},
{
"epoch": 7.45945945945946,
"grad_norm": 0.3746544705141892,
"learning_rate": 5e-06,
"loss": 0.1266,
"step": 828
},
{
"epoch": 7.468468468468468,
"grad_norm": 0.36162531031963435,
"learning_rate": 5e-06,
"loss": 0.1723,
"step": 829
},
{
"epoch": 7.4774774774774775,
"grad_norm": 0.30931420713180063,
"learning_rate": 5e-06,
"loss": 0.1497,
"step": 830
},
{
"epoch": 7.486486486486487,
"grad_norm": 0.27294359139491653,
"learning_rate": 5e-06,
"loss": 0.1409,
"step": 831
},
{
"epoch": 7.495495495495495,
"grad_norm": 0.2643297412103037,
"learning_rate": 5e-06,
"loss": 0.1339,
"step": 832
},
{
"epoch": 7.504504504504505,
"grad_norm": 0.27159435674152455,
"learning_rate": 5e-06,
"loss": 0.1432,
"step": 833
},
{
"epoch": 7.513513513513513,
"grad_norm": 0.29349995019825675,
"learning_rate": 5e-06,
"loss": 0.1431,
"step": 834
},
{
"epoch": 7.5225225225225225,
"grad_norm": 0.31813922083817525,
"learning_rate": 5e-06,
"loss": 0.1237,
"step": 835
},
{
"epoch": 7.531531531531532,
"grad_norm": 0.25431234598026253,
"learning_rate": 5e-06,
"loss": 0.1231,
"step": 836
},
{
"epoch": 7.54054054054054,
"grad_norm": 0.26549876685780915,
"learning_rate": 5e-06,
"loss": 0.1079,
"step": 837
},
{
"epoch": 7.54954954954955,
"grad_norm": 0.2843904679866454,
"learning_rate": 5e-06,
"loss": 0.1456,
"step": 838
},
{
"epoch": 7.558558558558558,
"grad_norm": 0.2764008229289936,
"learning_rate": 5e-06,
"loss": 0.1294,
"step": 839
},
{
"epoch": 7.5675675675675675,
"grad_norm": 0.27550196745644295,
"learning_rate": 5e-06,
"loss": 0.1169,
"step": 840
},
{
"epoch": 7.576576576576577,
"grad_norm": 0.2657966726978357,
"learning_rate": 5e-06,
"loss": 0.1283,
"step": 841
},
{
"epoch": 7.585585585585585,
"grad_norm": 0.279726882287188,
"learning_rate": 5e-06,
"loss": 0.1151,
"step": 842
},
{
"epoch": 7.594594594594595,
"grad_norm": 0.27489297214494474,
"learning_rate": 5e-06,
"loss": 0.1515,
"step": 843
},
{
"epoch": 7.603603603603604,
"grad_norm": 0.277774516155618,
"learning_rate": 5e-06,
"loss": 0.1503,
"step": 844
},
{
"epoch": 7.612612612612613,
"grad_norm": 0.27480641761427765,
"learning_rate": 5e-06,
"loss": 0.1075,
"step": 845
},
{
"epoch": 7.621621621621622,
"grad_norm": 0.2846350822817088,
"learning_rate": 5e-06,
"loss": 0.1514,
"step": 846
},
{
"epoch": 7.63063063063063,
"grad_norm": 0.28181647241406504,
"learning_rate": 5e-06,
"loss": 0.1314,
"step": 847
},
{
"epoch": 7.63963963963964,
"grad_norm": 0.3260999375199513,
"learning_rate": 5e-06,
"loss": 0.138,
"step": 848
},
{
"epoch": 7.648648648648649,
"grad_norm": 0.26191958014959504,
"learning_rate": 5e-06,
"loss": 0.1258,
"step": 849
},
{
"epoch": 7.657657657657658,
"grad_norm": 0.3229999390728544,
"learning_rate": 5e-06,
"loss": 0.15,
"step": 850
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.26451999606394416,
"learning_rate": 5e-06,
"loss": 0.1495,
"step": 851
},
{
"epoch": 7.675675675675675,
"grad_norm": 0.26990766634628194,
"learning_rate": 5e-06,
"loss": 0.136,
"step": 852
},
{
"epoch": 7.684684684684685,
"grad_norm": 0.3024735158151713,
"learning_rate": 5e-06,
"loss": 0.1506,
"step": 853
},
{
"epoch": 7.693693693693694,
"grad_norm": 0.2924088710387093,
"learning_rate": 5e-06,
"loss": 0.1327,
"step": 854
},
{
"epoch": 7.702702702702703,
"grad_norm": 0.3127786707020021,
"learning_rate": 5e-06,
"loss": 0.139,
"step": 855
},
{
"epoch": 7.711711711711712,
"grad_norm": 0.284180680266691,
"learning_rate": 5e-06,
"loss": 0.144,
"step": 856
},
{
"epoch": 7.7207207207207205,
"grad_norm": 0.28516328758030296,
"learning_rate": 5e-06,
"loss": 0.1428,
"step": 857
},
{
"epoch": 7.72972972972973,
"grad_norm": 0.2862117655255194,
"learning_rate": 5e-06,
"loss": 0.1579,
"step": 858
},
{
"epoch": 7.738738738738739,
"grad_norm": 0.3006220321307271,
"learning_rate": 5e-06,
"loss": 0.1603,
"step": 859
},
{
"epoch": 7.747747747747748,
"grad_norm": 0.29806627084091325,
"learning_rate": 5e-06,
"loss": 0.138,
"step": 860
},
{
"epoch": 7.756756756756757,
"grad_norm": 0.2947193709313226,
"learning_rate": 5e-06,
"loss": 0.1523,
"step": 861
},
{
"epoch": 7.7657657657657655,
"grad_norm": 0.27653789730185635,
"learning_rate": 5e-06,
"loss": 0.0974,
"step": 862
},
{
"epoch": 7.774774774774775,
"grad_norm": 0.3359884991528938,
"learning_rate": 5e-06,
"loss": 0.1498,
"step": 863
},
{
"epoch": 7.783783783783784,
"grad_norm": 0.3548503598800355,
"learning_rate": 5e-06,
"loss": 0.1437,
"step": 864
},
{
"epoch": 7.792792792792793,
"grad_norm": 0.2847477764044701,
"learning_rate": 5e-06,
"loss": 0.1393,
"step": 865
},
{
"epoch": 7.801801801801802,
"grad_norm": 0.3065450470724209,
"learning_rate": 5e-06,
"loss": 0.1575,
"step": 866
},
{
"epoch": 7.8108108108108105,
"grad_norm": 0.3012473965068844,
"learning_rate": 5e-06,
"loss": 0.1032,
"step": 867
},
{
"epoch": 7.81981981981982,
"grad_norm": 0.3010781272384048,
"learning_rate": 5e-06,
"loss": 0.1475,
"step": 868
},
{
"epoch": 7.828828828828829,
"grad_norm": 0.2802542506009257,
"learning_rate": 5e-06,
"loss": 0.1493,
"step": 869
},
{
"epoch": 7.837837837837838,
"grad_norm": 0.257772116445583,
"learning_rate": 5e-06,
"loss": 0.1712,
"step": 870
},
{
"epoch": 7.846846846846847,
"grad_norm": 0.2619565806462764,
"learning_rate": 5e-06,
"loss": 0.1511,
"step": 871
},
{
"epoch": 7.8558558558558556,
"grad_norm": 0.24721830116730928,
"learning_rate": 5e-06,
"loss": 0.11,
"step": 872
},
{
"epoch": 7.864864864864865,
"grad_norm": 0.2608797590307874,
"learning_rate": 5e-06,
"loss": 0.1434,
"step": 873
},
{
"epoch": 7.873873873873874,
"grad_norm": 0.3098206461383644,
"learning_rate": 5e-06,
"loss": 0.1244,
"step": 874
},
{
"epoch": 7.882882882882883,
"grad_norm": 0.2543474293117471,
"learning_rate": 5e-06,
"loss": 0.1484,
"step": 875
},
{
"epoch": 7.891891891891892,
"grad_norm": 0.275008000690447,
"learning_rate": 5e-06,
"loss": 0.1866,
"step": 876
},
{
"epoch": 7.900900900900901,
"grad_norm": 0.2574884831625064,
"learning_rate": 5e-06,
"loss": 0.1163,
"step": 877
},
{
"epoch": 7.90990990990991,
"grad_norm": 0.2735186605433472,
"learning_rate": 5e-06,
"loss": 0.181,
"step": 878
},
{
"epoch": 7.918918918918919,
"grad_norm": 0.274445320782787,
"learning_rate": 5e-06,
"loss": 0.1501,
"step": 879
},
{
"epoch": 7.927927927927928,
"grad_norm": 0.32592488504503725,
"learning_rate": 5e-06,
"loss": 0.149,
"step": 880
},
{
"epoch": 7.936936936936937,
"grad_norm": 0.2733852933643102,
"learning_rate": 5e-06,
"loss": 0.1434,
"step": 881
},
{
"epoch": 7.945945945945946,
"grad_norm": 0.2801042118029067,
"learning_rate": 5e-06,
"loss": 0.1348,
"step": 882
},
{
"epoch": 7.954954954954955,
"grad_norm": 0.27111497067523815,
"learning_rate": 5e-06,
"loss": 0.1597,
"step": 883
},
{
"epoch": 7.963963963963964,
"grad_norm": 0.27209799660825884,
"learning_rate": 5e-06,
"loss": 0.1521,
"step": 884
},
{
"epoch": 7.972972972972973,
"grad_norm": 0.3930775839000158,
"learning_rate": 5e-06,
"loss": 0.1559,
"step": 885
},
{
"epoch": 7.981981981981982,
"grad_norm": 0.2678257713350625,
"learning_rate": 5e-06,
"loss": 0.1541,
"step": 886
},
{
"epoch": 7.990990990990991,
"grad_norm": 0.2557726554508295,
"learning_rate": 5e-06,
"loss": 0.1293,
"step": 887
},
{
"epoch": 8.0,
"grad_norm": 0.30361837110340506,
"learning_rate": 5e-06,
"loss": 0.118,
"step": 888
},
{
"epoch": 8.00900900900901,
"grad_norm": 0.3724300858125284,
"learning_rate": 5e-06,
"loss": 0.119,
"step": 889
},
{
"epoch": 8.018018018018019,
"grad_norm": 0.278602815740515,
"learning_rate": 5e-06,
"loss": 0.1553,
"step": 890
},
{
"epoch": 8.027027027027026,
"grad_norm": 0.2757271522278008,
"learning_rate": 5e-06,
"loss": 0.1182,
"step": 891
},
{
"epoch": 8.036036036036036,
"grad_norm": 0.2929517079994954,
"learning_rate": 5e-06,
"loss": 0.1158,
"step": 892
},
{
"epoch": 8.045045045045045,
"grad_norm": 0.36372793530477965,
"learning_rate": 5e-06,
"loss": 0.1134,
"step": 893
},
{
"epoch": 8.054054054054054,
"grad_norm": 0.2817436945034752,
"learning_rate": 5e-06,
"loss": 0.0886,
"step": 894
},
{
"epoch": 8.063063063063064,
"grad_norm": 0.26617606652724307,
"learning_rate": 5e-06,
"loss": 0.1196,
"step": 895
},
{
"epoch": 8.072072072072071,
"grad_norm": 0.28867983451275775,
"learning_rate": 5e-06,
"loss": 0.1194,
"step": 896
},
{
"epoch": 8.08108108108108,
"grad_norm": 0.32961696028266857,
"learning_rate": 5e-06,
"loss": 0.1137,
"step": 897
},
{
"epoch": 8.09009009009009,
"grad_norm": 0.3261806017299068,
"learning_rate": 5e-06,
"loss": 0.1164,
"step": 898
},
{
"epoch": 8.0990990990991,
"grad_norm": 0.3047004644766596,
"learning_rate": 5e-06,
"loss": 0.1126,
"step": 899
},
{
"epoch": 8.108108108108109,
"grad_norm": 0.33711404617474894,
"learning_rate": 5e-06,
"loss": 0.1321,
"step": 900
},
{
"epoch": 8.117117117117116,
"grad_norm": 0.28781948503164106,
"learning_rate": 5e-06,
"loss": 0.0933,
"step": 901
},
{
"epoch": 8.126126126126126,
"grad_norm": 0.28954810553500565,
"learning_rate": 5e-06,
"loss": 0.1232,
"step": 902
},
{
"epoch": 8.135135135135135,
"grad_norm": 0.2934734159929239,
"learning_rate": 5e-06,
"loss": 0.1197,
"step": 903
},
{
"epoch": 8.144144144144144,
"grad_norm": 0.2851801626338276,
"learning_rate": 5e-06,
"loss": 0.1371,
"step": 904
},
{
"epoch": 8.153153153153154,
"grad_norm": 0.27920553933946246,
"learning_rate": 5e-06,
"loss": 0.1166,
"step": 905
},
{
"epoch": 8.162162162162161,
"grad_norm": 0.31343774042661315,
"learning_rate": 5e-06,
"loss": 0.1351,
"step": 906
},
{
"epoch": 8.17117117117117,
"grad_norm": 0.28444182940880375,
"learning_rate": 5e-06,
"loss": 0.1102,
"step": 907
},
{
"epoch": 8.18018018018018,
"grad_norm": 0.27773525186278575,
"learning_rate": 5e-06,
"loss": 0.1051,
"step": 908
},
{
"epoch": 8.18918918918919,
"grad_norm": 0.2822074526087139,
"learning_rate": 5e-06,
"loss": 0.1147,
"step": 909
},
{
"epoch": 8.198198198198199,
"grad_norm": 0.3445008346134781,
"learning_rate": 5e-06,
"loss": 0.1627,
"step": 910
},
{
"epoch": 8.207207207207206,
"grad_norm": 0.2948133646560585,
"learning_rate": 5e-06,
"loss": 0.1243,
"step": 911
},
{
"epoch": 8.216216216216216,
"grad_norm": 0.3548137336190423,
"learning_rate": 5e-06,
"loss": 0.1034,
"step": 912
},
{
"epoch": 8.225225225225225,
"grad_norm": 0.24835233982448732,
"learning_rate": 5e-06,
"loss": 0.1083,
"step": 913
},
{
"epoch": 8.234234234234235,
"grad_norm": 0.28608021315126475,
"learning_rate": 5e-06,
"loss": 0.0924,
"step": 914
},
{
"epoch": 8.243243243243244,
"grad_norm": 0.2489920981375781,
"learning_rate": 5e-06,
"loss": 0.0732,
"step": 915
},
{
"epoch": 8.252252252252251,
"grad_norm": 0.32648292494227393,
"learning_rate": 5e-06,
"loss": 0.101,
"step": 916
},
{
"epoch": 8.26126126126126,
"grad_norm": 0.26393238433265215,
"learning_rate": 5e-06,
"loss": 0.129,
"step": 917
},
{
"epoch": 8.27027027027027,
"grad_norm": 0.33000321897677853,
"learning_rate": 5e-06,
"loss": 0.1092,
"step": 918
},
{
"epoch": 8.27927927927928,
"grad_norm": 0.36101227626943727,
"learning_rate": 5e-06,
"loss": 0.1164,
"step": 919
},
{
"epoch": 8.288288288288289,
"grad_norm": 0.31480014766124803,
"learning_rate": 5e-06,
"loss": 0.1042,
"step": 920
},
{
"epoch": 8.297297297297296,
"grad_norm": 0.3411446684131361,
"learning_rate": 5e-06,
"loss": 0.1165,
"step": 921
},
{
"epoch": 8.306306306306306,
"grad_norm": 0.332814645717202,
"learning_rate": 5e-06,
"loss": 0.1078,
"step": 922
},
{
"epoch": 8.315315315315315,
"grad_norm": 0.5298958645427678,
"learning_rate": 5e-06,
"loss": 0.129,
"step": 923
},
{
"epoch": 8.324324324324325,
"grad_norm": 0.359036677437228,
"learning_rate": 5e-06,
"loss": 0.1297,
"step": 924
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.3038540419282529,
"learning_rate": 5e-06,
"loss": 0.0996,
"step": 925
},
{
"epoch": 8.342342342342342,
"grad_norm": 0.2836788711371018,
"learning_rate": 5e-06,
"loss": 0.1064,
"step": 926
},
{
"epoch": 8.35135135135135,
"grad_norm": 0.2840741884034298,
"learning_rate": 5e-06,
"loss": 0.1068,
"step": 927
},
{
"epoch": 8.36036036036036,
"grad_norm": 0.3606217340289843,
"learning_rate": 5e-06,
"loss": 0.1161,
"step": 928
},
{
"epoch": 8.36936936936937,
"grad_norm": 0.33639738597690616,
"learning_rate": 5e-06,
"loss": 0.1172,
"step": 929
},
{
"epoch": 8.378378378378379,
"grad_norm": 0.34030156462417316,
"learning_rate": 5e-06,
"loss": 0.0924,
"step": 930
},
{
"epoch": 8.387387387387387,
"grad_norm": 0.2833799915899549,
"learning_rate": 5e-06,
"loss": 0.1433,
"step": 931
},
{
"epoch": 8.396396396396396,
"grad_norm": 0.29693654573168704,
"learning_rate": 5e-06,
"loss": 0.1192,
"step": 932
},
{
"epoch": 8.405405405405405,
"grad_norm": 0.2834266663051093,
"learning_rate": 5e-06,
"loss": 0.1289,
"step": 933
},
{
"epoch": 8.414414414414415,
"grad_norm": 0.29926603005274255,
"learning_rate": 5e-06,
"loss": 0.1238,
"step": 934
},
{
"epoch": 8.423423423423424,
"grad_norm": 0.27131987409072494,
"learning_rate": 5e-06,
"loss": 0.1137,
"step": 935
},
{
"epoch": 8.432432432432432,
"grad_norm": 0.27245579434724637,
"learning_rate": 5e-06,
"loss": 0.0845,
"step": 936
},
{
"epoch": 8.441441441441441,
"grad_norm": 0.29790096192104826,
"learning_rate": 5e-06,
"loss": 0.1455,
"step": 937
},
{
"epoch": 8.45045045045045,
"grad_norm": 0.30112523923077406,
"learning_rate": 5e-06,
"loss": 0.1243,
"step": 938
},
{
"epoch": 8.45945945945946,
"grad_norm": 0.2975979835875494,
"learning_rate": 5e-06,
"loss": 0.1124,
"step": 939
},
{
"epoch": 8.468468468468469,
"grad_norm": 0.325062595930168,
"learning_rate": 5e-06,
"loss": 0.125,
"step": 940
},
{
"epoch": 8.477477477477478,
"grad_norm": 0.316039949527555,
"learning_rate": 5e-06,
"loss": 0.1423,
"step": 941
},
{
"epoch": 8.486486486486486,
"grad_norm": 0.38300001583678706,
"learning_rate": 5e-06,
"loss": 0.1402,
"step": 942
},
{
"epoch": 8.495495495495495,
"grad_norm": 0.368069280096857,
"learning_rate": 5e-06,
"loss": 0.1394,
"step": 943
},
{
"epoch": 8.504504504504505,
"grad_norm": 0.2836701503469006,
"learning_rate": 5e-06,
"loss": 0.1212,
"step": 944
},
{
"epoch": 8.513513513513514,
"grad_norm": 0.28208846723398523,
"learning_rate": 5e-06,
"loss": 0.1176,
"step": 945
},
{
"epoch": 8.522522522522522,
"grad_norm": 0.3223580674636738,
"learning_rate": 5e-06,
"loss": 0.1085,
"step": 946
},
{
"epoch": 8.531531531531531,
"grad_norm": 0.2975838495736659,
"learning_rate": 5e-06,
"loss": 0.1388,
"step": 947
},
{
"epoch": 8.54054054054054,
"grad_norm": 0.2926384544838666,
"learning_rate": 5e-06,
"loss": 0.1446,
"step": 948
},
{
"epoch": 8.54954954954955,
"grad_norm": 0.253328197509007,
"learning_rate": 5e-06,
"loss": 0.1106,
"step": 949
},
{
"epoch": 8.558558558558559,
"grad_norm": 0.29078679203113755,
"learning_rate": 5e-06,
"loss": 0.147,
"step": 950
},
{
"epoch": 8.567567567567568,
"grad_norm": 0.3242982227120515,
"learning_rate": 5e-06,
"loss": 0.114,
"step": 951
},
{
"epoch": 8.576576576576576,
"grad_norm": 0.31916745252108,
"learning_rate": 5e-06,
"loss": 0.1467,
"step": 952
},
{
"epoch": 8.585585585585585,
"grad_norm": 0.30713744803388243,
"learning_rate": 5e-06,
"loss": 0.1165,
"step": 953
},
{
"epoch": 8.594594594594595,
"grad_norm": 0.31006975320296604,
"learning_rate": 5e-06,
"loss": 0.1293,
"step": 954
},
{
"epoch": 8.603603603603604,
"grad_norm": 0.2929902011566849,
"learning_rate": 5e-06,
"loss": 0.1134,
"step": 955
},
{
"epoch": 8.612612612612612,
"grad_norm": 0.26235543384430693,
"learning_rate": 5e-06,
"loss": 0.1053,
"step": 956
},
{
"epoch": 8.621621621621621,
"grad_norm": 0.310009086101237,
"learning_rate": 5e-06,
"loss": 0.1353,
"step": 957
},
{
"epoch": 8.63063063063063,
"grad_norm": 0.4012614816736551,
"learning_rate": 5e-06,
"loss": 0.1248,
"step": 958
},
{
"epoch": 8.63963963963964,
"grad_norm": 0.2922213629865694,
"learning_rate": 5e-06,
"loss": 0.1229,
"step": 959
},
{
"epoch": 8.64864864864865,
"grad_norm": 0.28311443461185315,
"learning_rate": 5e-06,
"loss": 0.1309,
"step": 960
},
{
"epoch": 8.657657657657658,
"grad_norm": 0.3882325365638355,
"learning_rate": 5e-06,
"loss": 0.134,
"step": 961
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.3162063136020091,
"learning_rate": 5e-06,
"loss": 0.1166,
"step": 962
},
{
"epoch": 8.675675675675675,
"grad_norm": 0.3408953528226783,
"learning_rate": 5e-06,
"loss": 0.1289,
"step": 963
},
{
"epoch": 8.684684684684685,
"grad_norm": 0.2558993641119161,
"learning_rate": 5e-06,
"loss": 0.0915,
"step": 964
},
{
"epoch": 8.693693693693694,
"grad_norm": 0.3240415851794722,
"learning_rate": 5e-06,
"loss": 0.1283,
"step": 965
},
{
"epoch": 8.702702702702704,
"grad_norm": 0.2802378425767391,
"learning_rate": 5e-06,
"loss": 0.1462,
"step": 966
},
{
"epoch": 8.711711711711711,
"grad_norm": 0.32417830620793386,
"learning_rate": 5e-06,
"loss": 0.1332,
"step": 967
},
{
"epoch": 8.72072072072072,
"grad_norm": 0.27199769909957633,
"learning_rate": 5e-06,
"loss": 0.111,
"step": 968
},
{
"epoch": 8.72972972972973,
"grad_norm": 0.3143617324505027,
"learning_rate": 5e-06,
"loss": 0.1215,
"step": 969
},
{
"epoch": 8.73873873873874,
"grad_norm": 0.3294346785496534,
"learning_rate": 5e-06,
"loss": 0.1054,
"step": 970
},
{
"epoch": 8.747747747747749,
"grad_norm": 0.30155526964094,
"learning_rate": 5e-06,
"loss": 0.1222,
"step": 971
},
{
"epoch": 8.756756756756756,
"grad_norm": 0.3117950940446377,
"learning_rate": 5e-06,
"loss": 0.114,
"step": 972
},
{
"epoch": 8.765765765765765,
"grad_norm": 0.3392300144846105,
"learning_rate": 5e-06,
"loss": 0.1163,
"step": 973
},
{
"epoch": 8.774774774774775,
"grad_norm": 0.2977388674828091,
"learning_rate": 5e-06,
"loss": 0.1386,
"step": 974
},
{
"epoch": 8.783783783783784,
"grad_norm": 0.3088070527312149,
"learning_rate": 5e-06,
"loss": 0.1622,
"step": 975
},
{
"epoch": 8.792792792792794,
"grad_norm": 0.28460989793331587,
"learning_rate": 5e-06,
"loss": 0.1321,
"step": 976
},
{
"epoch": 8.801801801801801,
"grad_norm": 0.301114864385651,
"learning_rate": 5e-06,
"loss": 0.1084,
"step": 977
},
{
"epoch": 8.81081081081081,
"grad_norm": 0.3080454311504172,
"learning_rate": 5e-06,
"loss": 0.0958,
"step": 978
},
{
"epoch": 8.81981981981982,
"grad_norm": 0.35110800631668737,
"learning_rate": 5e-06,
"loss": 0.1272,
"step": 979
},
{
"epoch": 8.82882882882883,
"grad_norm": 0.27956217271886274,
"learning_rate": 5e-06,
"loss": 0.1327,
"step": 980
},
{
"epoch": 8.837837837837839,
"grad_norm": 0.32361648737642695,
"learning_rate": 5e-06,
"loss": 0.1214,
"step": 981
},
{
"epoch": 8.846846846846846,
"grad_norm": 0.33228900753392643,
"learning_rate": 5e-06,
"loss": 0.0843,
"step": 982
},
{
"epoch": 8.855855855855856,
"grad_norm": 0.32323655451004957,
"learning_rate": 5e-06,
"loss": 0.117,
"step": 983
},
{
"epoch": 8.864864864864865,
"grad_norm": 0.3411596228576446,
"learning_rate": 5e-06,
"loss": 0.1137,
"step": 984
},
{
"epoch": 8.873873873873874,
"grad_norm": 0.33618399554078643,
"learning_rate": 5e-06,
"loss": 0.1232,
"step": 985
},
{
"epoch": 8.882882882882884,
"grad_norm": 0.31438715504842607,
"learning_rate": 5e-06,
"loss": 0.1188,
"step": 986
},
{
"epoch": 8.891891891891891,
"grad_norm": 0.29235691269480235,
"learning_rate": 5e-06,
"loss": 0.0907,
"step": 987
},
{
"epoch": 8.9009009009009,
"grad_norm": 0.31569994309412647,
"learning_rate": 5e-06,
"loss": 0.139,
"step": 988
},
{
"epoch": 8.90990990990991,
"grad_norm": 0.3488819032640533,
"learning_rate": 5e-06,
"loss": 0.0925,
"step": 989
},
{
"epoch": 8.91891891891892,
"grad_norm": 0.3287782461836467,
"learning_rate": 5e-06,
"loss": 0.1144,
"step": 990
},
{
"epoch": 8.927927927927929,
"grad_norm": 0.342018883415981,
"learning_rate": 5e-06,
"loss": 0.1554,
"step": 991
},
{
"epoch": 8.936936936936936,
"grad_norm": 0.30922097124521764,
"learning_rate": 5e-06,
"loss": 0.0773,
"step": 992
},
{
"epoch": 8.945945945945946,
"grad_norm": 0.3170605884389048,
"learning_rate": 5e-06,
"loss": 0.0961,
"step": 993
},
{
"epoch": 8.954954954954955,
"grad_norm": 0.2894394712507756,
"learning_rate": 5e-06,
"loss": 0.1175,
"step": 994
},
{
"epoch": 8.963963963963964,
"grad_norm": 0.3507612251253632,
"learning_rate": 5e-06,
"loss": 0.0898,
"step": 995
},
{
"epoch": 8.972972972972974,
"grad_norm": 0.2916461407037756,
"learning_rate": 5e-06,
"loss": 0.1162,
"step": 996
},
{
"epoch": 8.981981981981981,
"grad_norm": 0.3148022366204299,
"learning_rate": 5e-06,
"loss": 0.1263,
"step": 997
},
{
"epoch": 8.99099099099099,
"grad_norm": 0.2917019835417808,
"learning_rate": 5e-06,
"loss": 0.1088,
"step": 998
},
{
"epoch": 9.0,
"grad_norm": 0.27462543957892144,
"learning_rate": 5e-06,
"loss": 0.1158,
"step": 999
}
],
"logging_steps": 1.0,
"max_steps": 11100,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1278638161920000.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}