DeepDream2045's picture
Training in progress, step 294, checkpoint
5f23047 verified
raw
history blame
52.1 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2500797448165869,
"eval_steps": 294,
"global_step": 294,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008506113769271664,
"grad_norm": 0.7357208728790283,
"learning_rate": 2e-05,
"loss": 2.8145,
"step": 1
},
{
"epoch": 0.0017012227538543328,
"grad_norm": 0.771299421787262,
"learning_rate": 4e-05,
"loss": 3.195,
"step": 2
},
{
"epoch": 0.002551834130781499,
"grad_norm": 0.7344720363616943,
"learning_rate": 6e-05,
"loss": 2.8861,
"step": 3
},
{
"epoch": 0.0034024455077086655,
"grad_norm": 0.7500324845314026,
"learning_rate": 8e-05,
"loss": 2.7421,
"step": 4
},
{
"epoch": 0.004253056884635832,
"grad_norm": 0.9078495502471924,
"learning_rate": 0.0001,
"loss": 2.9622,
"step": 5
},
{
"epoch": 0.005103668261562998,
"grad_norm": 1.0794708728790283,
"learning_rate": 0.00012,
"loss": 3.1124,
"step": 6
},
{
"epoch": 0.005954279638490165,
"grad_norm": 1.0218361616134644,
"learning_rate": 0.00014,
"loss": 2.7233,
"step": 7
},
{
"epoch": 0.006804891015417331,
"grad_norm": 1.059141755104065,
"learning_rate": 0.00016,
"loss": 2.8784,
"step": 8
},
{
"epoch": 0.007655502392344498,
"grad_norm": 0.4901650547981262,
"learning_rate": 0.00018,
"loss": 2.6192,
"step": 9
},
{
"epoch": 0.008506113769271665,
"grad_norm": 0.8344933390617371,
"learning_rate": 0.0002,
"loss": 2.6448,
"step": 10
},
{
"epoch": 0.00935672514619883,
"grad_norm": 1.5278894901275635,
"learning_rate": 0.00019999963702861705,
"loss": 2.7457,
"step": 11
},
{
"epoch": 0.010207336523125997,
"grad_norm": 1.2650033235549927,
"learning_rate": 0.00019999854811710317,
"loss": 2.7532,
"step": 12
},
{
"epoch": 0.011057947900053162,
"grad_norm": 0.740222156047821,
"learning_rate": 0.0001999967332733632,
"loss": 2.6836,
"step": 13
},
{
"epoch": 0.01190855927698033,
"grad_norm": 0.49257639050483704,
"learning_rate": 0.0001999941925105719,
"loss": 2.6658,
"step": 14
},
{
"epoch": 0.012759170653907496,
"grad_norm": 0.3310573399066925,
"learning_rate": 0.00019999092584717374,
"loss": 2.5043,
"step": 15
},
{
"epoch": 0.013609782030834662,
"grad_norm": 0.33361560106277466,
"learning_rate": 0.00019998693330688282,
"loss": 2.6252,
"step": 16
},
{
"epoch": 0.014460393407761828,
"grad_norm": 0.4449865221977234,
"learning_rate": 0.00019998221491868273,
"loss": 2.648,
"step": 17
},
{
"epoch": 0.015311004784688996,
"grad_norm": 0.4820970892906189,
"learning_rate": 0.0001999767707168262,
"loss": 2.7337,
"step": 18
},
{
"epoch": 0.01616161616161616,
"grad_norm": 0.5144203901290894,
"learning_rate": 0.0001999706007408351,
"loss": 2.6967,
"step": 19
},
{
"epoch": 0.01701222753854333,
"grad_norm": 0.501557469367981,
"learning_rate": 0.0001999637050354999,
"loss": 2.7318,
"step": 20
},
{
"epoch": 0.017862838915470493,
"grad_norm": 0.4480394423007965,
"learning_rate": 0.00019995608365087946,
"loss": 2.4126,
"step": 21
},
{
"epoch": 0.01871345029239766,
"grad_norm": 0.4459284842014313,
"learning_rate": 0.00019994773664230064,
"loss": 2.7072,
"step": 22
},
{
"epoch": 0.01956406166932483,
"grad_norm": 0.39909827709198,
"learning_rate": 0.00019993866407035798,
"loss": 2.6358,
"step": 23
},
{
"epoch": 0.020414673046251993,
"grad_norm": 0.36802783608436584,
"learning_rate": 0.0001999288660009132,
"loss": 2.6751,
"step": 24
},
{
"epoch": 0.02126528442317916,
"grad_norm": 0.43287962675094604,
"learning_rate": 0.0001999183425050946,
"loss": 2.7518,
"step": 25
},
{
"epoch": 0.022115895800106325,
"grad_norm": 0.4289425313472748,
"learning_rate": 0.00019990709365929677,
"loss": 2.7535,
"step": 26
},
{
"epoch": 0.022966507177033493,
"grad_norm": 0.4627043604850769,
"learning_rate": 0.00019989511954517992,
"loss": 2.8111,
"step": 27
},
{
"epoch": 0.02381711855396066,
"grad_norm": 0.4823961853981018,
"learning_rate": 0.00019988242024966923,
"loss": 2.9493,
"step": 28
},
{
"epoch": 0.024667729930887825,
"grad_norm": 0.4622437059879303,
"learning_rate": 0.00019986899586495432,
"loss": 2.788,
"step": 29
},
{
"epoch": 0.025518341307814992,
"grad_norm": 0.4963669776916504,
"learning_rate": 0.00019985484648848853,
"loss": 2.8304,
"step": 30
},
{
"epoch": 0.02636895268474216,
"grad_norm": 0.47957557439804077,
"learning_rate": 0.00019983997222298828,
"loss": 2.7323,
"step": 31
},
{
"epoch": 0.027219564061669324,
"grad_norm": 0.445528507232666,
"learning_rate": 0.00019982437317643217,
"loss": 3.015,
"step": 32
},
{
"epoch": 0.028070175438596492,
"grad_norm": 0.46085312962532043,
"learning_rate": 0.00019980804946206036,
"loss": 2.8556,
"step": 33
},
{
"epoch": 0.028920786815523656,
"grad_norm": 0.5078282356262207,
"learning_rate": 0.0001997910011983737,
"loss": 2.8472,
"step": 34
},
{
"epoch": 0.029771398192450824,
"grad_norm": 0.4612430930137634,
"learning_rate": 0.00019977322850913283,
"loss": 2.6399,
"step": 35
},
{
"epoch": 0.03062200956937799,
"grad_norm": 0.499965101480484,
"learning_rate": 0.00019975473152335726,
"loss": 2.9121,
"step": 36
},
{
"epoch": 0.03147262094630516,
"grad_norm": 0.5101069808006287,
"learning_rate": 0.0001997355103753246,
"loss": 2.8488,
"step": 37
},
{
"epoch": 0.03232323232323232,
"grad_norm": 0.5065872669219971,
"learning_rate": 0.00019971556520456929,
"loss": 2.8311,
"step": 38
},
{
"epoch": 0.03317384370015949,
"grad_norm": 0.5324426889419556,
"learning_rate": 0.00019969489615588189,
"loss": 2.7454,
"step": 39
},
{
"epoch": 0.03402445507708666,
"grad_norm": 0.5128815770149231,
"learning_rate": 0.0001996735033793079,
"loss": 2.8116,
"step": 40
},
{
"epoch": 0.03487506645401382,
"grad_norm": 0.5330538153648376,
"learning_rate": 0.00019965138703014655,
"loss": 2.7584,
"step": 41
},
{
"epoch": 0.03572567783094099,
"grad_norm": 0.556816577911377,
"learning_rate": 0.00019962854726894997,
"loss": 2.8902,
"step": 42
},
{
"epoch": 0.03657628920786816,
"grad_norm": 0.5452866554260254,
"learning_rate": 0.0001996049842615217,
"loss": 2.7984,
"step": 43
},
{
"epoch": 0.03742690058479532,
"grad_norm": 0.5836021304130554,
"learning_rate": 0.0001995806981789157,
"loss": 2.803,
"step": 44
},
{
"epoch": 0.03827751196172249,
"grad_norm": 0.5968561172485352,
"learning_rate": 0.00019955568919743507,
"loss": 2.8592,
"step": 45
},
{
"epoch": 0.03912812333864966,
"grad_norm": 0.6416970491409302,
"learning_rate": 0.0001995299574986306,
"loss": 2.7488,
"step": 46
},
{
"epoch": 0.03997873471557682,
"grad_norm": 0.704325795173645,
"learning_rate": 0.0001995035032692998,
"loss": 2.6983,
"step": 47
},
{
"epoch": 0.040829346092503986,
"grad_norm": 0.7766572833061218,
"learning_rate": 0.00019947632670148517,
"loss": 2.9677,
"step": 48
},
{
"epoch": 0.04167995746943115,
"grad_norm": 0.7186003923416138,
"learning_rate": 0.00019944842799247308,
"loss": 3.0728,
"step": 49
},
{
"epoch": 0.04253056884635832,
"grad_norm": 0.7572959065437317,
"learning_rate": 0.00019941980734479214,
"loss": 3.0345,
"step": 50
},
{
"epoch": 0.043381180223285486,
"grad_norm": 0.48461732268333435,
"learning_rate": 0.00019939046496621194,
"loss": 2.6307,
"step": 51
},
{
"epoch": 0.04423179160021265,
"grad_norm": 0.468675434589386,
"learning_rate": 0.0001993604010697413,
"loss": 2.4616,
"step": 52
},
{
"epoch": 0.04508240297713982,
"grad_norm": 0.3815957009792328,
"learning_rate": 0.0001993296158736269,
"loss": 2.7479,
"step": 53
},
{
"epoch": 0.045933014354066985,
"grad_norm": 0.3313361704349518,
"learning_rate": 0.00019929810960135172,
"loss": 2.4983,
"step": 54
},
{
"epoch": 0.04678362573099415,
"grad_norm": 0.32521429657936096,
"learning_rate": 0.00019926588248163316,
"loss": 2.5446,
"step": 55
},
{
"epoch": 0.04763423710792132,
"grad_norm": 0.2972453236579895,
"learning_rate": 0.00019923293474842174,
"loss": 2.5472,
"step": 56
},
{
"epoch": 0.048484848484848485,
"grad_norm": 0.2972238063812256,
"learning_rate": 0.00019919926664089909,
"loss": 2.5389,
"step": 57
},
{
"epoch": 0.04933545986177565,
"grad_norm": 0.27498453855514526,
"learning_rate": 0.00019916487840347644,
"loss": 2.571,
"step": 58
},
{
"epoch": 0.05018607123870282,
"grad_norm": 0.2938655614852905,
"learning_rate": 0.00019912977028579268,
"loss": 2.7134,
"step": 59
},
{
"epoch": 0.051036682615629984,
"grad_norm": 0.26742392778396606,
"learning_rate": 0.0001990939425427127,
"loss": 2.5632,
"step": 60
},
{
"epoch": 0.05188729399255715,
"grad_norm": 0.28117692470550537,
"learning_rate": 0.00019905739543432536,
"loss": 2.5297,
"step": 61
},
{
"epoch": 0.05273790536948432,
"grad_norm": 0.28916725516319275,
"learning_rate": 0.00019902012922594177,
"loss": 2.7096,
"step": 62
},
{
"epoch": 0.053588516746411484,
"grad_norm": 0.32468459010124207,
"learning_rate": 0.0001989821441880933,
"loss": 2.6192,
"step": 63
},
{
"epoch": 0.05443912812333865,
"grad_norm": 0.2806537449359894,
"learning_rate": 0.0001989434405965295,
"loss": 2.6747,
"step": 64
},
{
"epoch": 0.05528973950026582,
"grad_norm": 0.2876998782157898,
"learning_rate": 0.0001989040187322164,
"loss": 2.7443,
"step": 65
},
{
"epoch": 0.056140350877192984,
"grad_norm": 0.27619123458862305,
"learning_rate": 0.00019886387888133413,
"loss": 2.7379,
"step": 66
},
{
"epoch": 0.05699096225412015,
"grad_norm": 0.31479549407958984,
"learning_rate": 0.000198823021335275,
"loss": 2.4039,
"step": 67
},
{
"epoch": 0.05784157363104731,
"grad_norm": 0.300857812166214,
"learning_rate": 0.00019878144639064144,
"loss": 2.5705,
"step": 68
},
{
"epoch": 0.05869218500797448,
"grad_norm": 0.3776433765888214,
"learning_rate": 0.00019873915434924375,
"loss": 2.863,
"step": 69
},
{
"epoch": 0.05954279638490165,
"grad_norm": 0.30585938692092896,
"learning_rate": 0.00019869614551809795,
"loss": 2.5312,
"step": 70
},
{
"epoch": 0.06039340776182881,
"grad_norm": 0.3163856267929077,
"learning_rate": 0.00019865242020942353,
"loss": 2.8491,
"step": 71
},
{
"epoch": 0.06124401913875598,
"grad_norm": 0.30077147483825684,
"learning_rate": 0.00019860797874064122,
"loss": 2.7777,
"step": 72
},
{
"epoch": 0.06209463051568315,
"grad_norm": 0.4153176248073578,
"learning_rate": 0.0001985628214343706,
"loss": 2.7499,
"step": 73
},
{
"epoch": 0.06294524189261032,
"grad_norm": 0.35611122846603394,
"learning_rate": 0.00019851694861842793,
"loss": 2.7089,
"step": 74
},
{
"epoch": 0.06379585326953748,
"grad_norm": 0.3143812417984009,
"learning_rate": 0.00019847036062582357,
"loss": 2.758,
"step": 75
},
{
"epoch": 0.06464646464646465,
"grad_norm": 0.32024794816970825,
"learning_rate": 0.00019842305779475968,
"loss": 2.4616,
"step": 76
},
{
"epoch": 0.06549707602339182,
"grad_norm": 0.3146126866340637,
"learning_rate": 0.00019837504046862775,
"loss": 2.6104,
"step": 77
},
{
"epoch": 0.06634768740031897,
"grad_norm": 0.32578444480895996,
"learning_rate": 0.00019832630899600608,
"loss": 2.6297,
"step": 78
},
{
"epoch": 0.06719829877724615,
"grad_norm": 0.36873045563697815,
"learning_rate": 0.00019827686373065728,
"loss": 2.6358,
"step": 79
},
{
"epoch": 0.06804891015417332,
"grad_norm": 0.3558378517627716,
"learning_rate": 0.00019822670503152567,
"loss": 2.6308,
"step": 80
},
{
"epoch": 0.06889952153110047,
"grad_norm": 0.37967684864997864,
"learning_rate": 0.00019817583326273467,
"loss": 2.7577,
"step": 81
},
{
"epoch": 0.06975013290802765,
"grad_norm": 0.3737669885158539,
"learning_rate": 0.00019812424879358425,
"loss": 2.9207,
"step": 82
},
{
"epoch": 0.07060074428495482,
"grad_norm": 0.39410829544067383,
"learning_rate": 0.0001980719519985481,
"loss": 2.9544,
"step": 83
},
{
"epoch": 0.07145135566188197,
"grad_norm": 0.3863750696182251,
"learning_rate": 0.00019801894325727104,
"loss": 2.7794,
"step": 84
},
{
"epoch": 0.07230196703880915,
"grad_norm": 0.4226458966732025,
"learning_rate": 0.0001979652229545662,
"loss": 2.7491,
"step": 85
},
{
"epoch": 0.07315257841573632,
"grad_norm": 0.42758506536483765,
"learning_rate": 0.0001979107914804122,
"loss": 2.8524,
"step": 86
},
{
"epoch": 0.07400318979266347,
"grad_norm": 0.4379200041294098,
"learning_rate": 0.0001978556492299504,
"loss": 2.6526,
"step": 87
},
{
"epoch": 0.07485380116959064,
"grad_norm": 0.44331902265548706,
"learning_rate": 0.000197799796603482,
"loss": 2.8028,
"step": 88
},
{
"epoch": 0.07570441254651782,
"grad_norm": 0.4358711540699005,
"learning_rate": 0.0001977432340064651,
"loss": 2.5426,
"step": 89
},
{
"epoch": 0.07655502392344497,
"grad_norm": 0.45511335134506226,
"learning_rate": 0.00019768596184951173,
"loss": 2.7067,
"step": 90
},
{
"epoch": 0.07740563530037214,
"grad_norm": 0.5394377112388611,
"learning_rate": 0.00019762798054838502,
"loss": 2.8189,
"step": 91
},
{
"epoch": 0.07825624667729932,
"grad_norm": 0.5124706625938416,
"learning_rate": 0.00019756929052399603,
"loss": 2.7702,
"step": 92
},
{
"epoch": 0.07910685805422647,
"grad_norm": 0.5025349855422974,
"learning_rate": 0.00019750989220240073,
"loss": 2.6872,
"step": 93
},
{
"epoch": 0.07995746943115364,
"grad_norm": 0.5144663453102112,
"learning_rate": 0.00019744978601479694,
"loss": 2.6366,
"step": 94
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.5908443927764893,
"learning_rate": 0.00019738897239752118,
"loss": 2.7918,
"step": 95
},
{
"epoch": 0.08165869218500797,
"grad_norm": 0.6398508548736572,
"learning_rate": 0.00019732745179204552,
"loss": 2.9972,
"step": 96
},
{
"epoch": 0.08250930356193514,
"grad_norm": 0.6032273173332214,
"learning_rate": 0.00019726522464497435,
"loss": 2.7638,
"step": 97
},
{
"epoch": 0.0833599149388623,
"grad_norm": 0.6310097575187683,
"learning_rate": 0.0001972022914080411,
"loss": 2.9328,
"step": 98
},
{
"epoch": 0.08421052631578947,
"grad_norm": 0.7050711512565613,
"learning_rate": 0.00019713865253810506,
"loss": 2.8143,
"step": 99
},
{
"epoch": 0.08506113769271664,
"grad_norm": 0.755136251449585,
"learning_rate": 0.00019707430849714807,
"loss": 3.036,
"step": 100
},
{
"epoch": 0.0859117490696438,
"grad_norm": 0.35153907537460327,
"learning_rate": 0.00019700925975227096,
"loss": 2.4444,
"step": 101
},
{
"epoch": 0.08676236044657097,
"grad_norm": 0.40153488516807556,
"learning_rate": 0.0001969435067756904,
"loss": 2.6068,
"step": 102
},
{
"epoch": 0.08761297182349814,
"grad_norm": 0.3474213480949402,
"learning_rate": 0.00019687705004473545,
"loss": 2.4261,
"step": 103
},
{
"epoch": 0.0884635832004253,
"grad_norm": 0.3283519744873047,
"learning_rate": 0.00019680989004184382,
"loss": 2.6736,
"step": 104
},
{
"epoch": 0.08931419457735247,
"grad_norm": 0.29034170508384705,
"learning_rate": 0.00019674202725455877,
"loss": 2.5551,
"step": 105
},
{
"epoch": 0.09016480595427964,
"grad_norm": 0.2918970584869385,
"learning_rate": 0.00019667346217552527,
"loss": 2.6039,
"step": 106
},
{
"epoch": 0.0910154173312068,
"grad_norm": 0.2852106988430023,
"learning_rate": 0.00019660419530248655,
"loss": 2.5432,
"step": 107
},
{
"epoch": 0.09186602870813397,
"grad_norm": 0.30997323989868164,
"learning_rate": 0.0001965342271382805,
"loss": 2.7324,
"step": 108
},
{
"epoch": 0.09271664008506114,
"grad_norm": 0.34156399965286255,
"learning_rate": 0.00019646355819083589,
"loss": 2.6548,
"step": 109
},
{
"epoch": 0.0935672514619883,
"grad_norm": 0.2763843238353729,
"learning_rate": 0.00019639218897316883,
"loss": 2.5254,
"step": 110
},
{
"epoch": 0.09441786283891547,
"grad_norm": 0.2835611402988434,
"learning_rate": 0.00019632012000337908,
"loss": 2.5677,
"step": 111
},
{
"epoch": 0.09526847421584264,
"grad_norm": 0.2940271198749542,
"learning_rate": 0.00019624735180464602,
"loss": 2.5976,
"step": 112
},
{
"epoch": 0.0961190855927698,
"grad_norm": 0.2714485824108124,
"learning_rate": 0.00019617388490522517,
"loss": 2.6087,
"step": 113
},
{
"epoch": 0.09696969696969697,
"grad_norm": 0.30371204018592834,
"learning_rate": 0.00019609971983844412,
"loss": 2.6129,
"step": 114
},
{
"epoch": 0.09782030834662414,
"grad_norm": 0.2762625813484192,
"learning_rate": 0.0001960248571426989,
"loss": 2.5759,
"step": 115
},
{
"epoch": 0.0986709197235513,
"grad_norm": 0.2702981233596802,
"learning_rate": 0.00019594929736144976,
"loss": 2.5443,
"step": 116
},
{
"epoch": 0.09952153110047847,
"grad_norm": 0.29210978746414185,
"learning_rate": 0.00019587304104321746,
"loss": 2.6425,
"step": 117
},
{
"epoch": 0.10037214247740564,
"grad_norm": 0.31620749831199646,
"learning_rate": 0.00019579608874157928,
"loss": 2.703,
"step": 118
},
{
"epoch": 0.1012227538543328,
"grad_norm": 0.2803102433681488,
"learning_rate": 0.00019571844101516484,
"loss": 2.6886,
"step": 119
},
{
"epoch": 0.10207336523125997,
"grad_norm": 0.30169349908828735,
"learning_rate": 0.00019564009842765225,
"loss": 2.8221,
"step": 120
},
{
"epoch": 0.10292397660818714,
"grad_norm": 0.297553151845932,
"learning_rate": 0.00019556106154776379,
"loss": 2.6897,
"step": 121
},
{
"epoch": 0.1037745879851143,
"grad_norm": 0.30721086263656616,
"learning_rate": 0.000195481330949262,
"loss": 2.6551,
"step": 122
},
{
"epoch": 0.10462519936204147,
"grad_norm": 0.29124605655670166,
"learning_rate": 0.00019540090721094542,
"loss": 2.6292,
"step": 123
},
{
"epoch": 0.10547581073896864,
"grad_norm": 0.31037285923957825,
"learning_rate": 0.0001953197909166443,
"loss": 2.5459,
"step": 124
},
{
"epoch": 0.1063264221158958,
"grad_norm": 0.3543750047683716,
"learning_rate": 0.00019523798265521654,
"loss": 2.5622,
"step": 125
},
{
"epoch": 0.10717703349282297,
"grad_norm": 0.3356544077396393,
"learning_rate": 0.00019515548302054335,
"loss": 2.7272,
"step": 126
},
{
"epoch": 0.10802764486975014,
"grad_norm": 0.34296396374702454,
"learning_rate": 0.00019507229261152476,
"loss": 2.6629,
"step": 127
},
{
"epoch": 0.1088782562466773,
"grad_norm": 0.34629112482070923,
"learning_rate": 0.0001949884120320756,
"loss": 2.6371,
"step": 128
},
{
"epoch": 0.10972886762360447,
"grad_norm": 0.34170377254486084,
"learning_rate": 0.00019490384189112082,
"loss": 2.7218,
"step": 129
},
{
"epoch": 0.11057947900053164,
"grad_norm": 0.38438230752944946,
"learning_rate": 0.0001948185828025913,
"loss": 2.7096,
"step": 130
},
{
"epoch": 0.1114300903774588,
"grad_norm": 0.40347060561180115,
"learning_rate": 0.00019473263538541914,
"loss": 2.8129,
"step": 131
},
{
"epoch": 0.11228070175438597,
"grad_norm": 0.3742891848087311,
"learning_rate": 0.00019464600026353348,
"loss": 2.7916,
"step": 132
},
{
"epoch": 0.11313131313131314,
"grad_norm": 0.4015231430530548,
"learning_rate": 0.0001945586780658557,
"loss": 2.6099,
"step": 133
},
{
"epoch": 0.1139819245082403,
"grad_norm": 0.40618133544921875,
"learning_rate": 0.00019447066942629491,
"loss": 2.6669,
"step": 134
},
{
"epoch": 0.11483253588516747,
"grad_norm": 0.4171842932701111,
"learning_rate": 0.00019438197498374357,
"loss": 2.6272,
"step": 135
},
{
"epoch": 0.11568314726209462,
"grad_norm": 0.443013995885849,
"learning_rate": 0.0001942925953820725,
"loss": 2.5722,
"step": 136
},
{
"epoch": 0.1165337586390218,
"grad_norm": 0.4636158347129822,
"learning_rate": 0.00019420253127012645,
"loss": 2.8075,
"step": 137
},
{
"epoch": 0.11738437001594897,
"grad_norm": 0.4271916151046753,
"learning_rate": 0.00019411178330171937,
"loss": 2.6875,
"step": 138
},
{
"epoch": 0.11823498139287612,
"grad_norm": 0.47826603055000305,
"learning_rate": 0.00019402035213562954,
"loss": 2.7042,
"step": 139
},
{
"epoch": 0.1190855927698033,
"grad_norm": 0.46729791164398193,
"learning_rate": 0.0001939282384355949,
"loss": 2.6663,
"step": 140
},
{
"epoch": 0.11993620414673047,
"grad_norm": 0.4689824879169464,
"learning_rate": 0.0001938354428703082,
"loss": 2.6138,
"step": 141
},
{
"epoch": 0.12078681552365762,
"grad_norm": 0.526096522808075,
"learning_rate": 0.0001937419661134121,
"loss": 2.9258,
"step": 142
},
{
"epoch": 0.1216374269005848,
"grad_norm": 0.5075511932373047,
"learning_rate": 0.0001936478088434944,
"loss": 2.8021,
"step": 143
},
{
"epoch": 0.12248803827751197,
"grad_norm": 0.5048439502716064,
"learning_rate": 0.00019355297174408298,
"loss": 2.6274,
"step": 144
},
{
"epoch": 0.12333864965443912,
"grad_norm": 0.5787357687950134,
"learning_rate": 0.00019345745550364087,
"loss": 2.851,
"step": 145
},
{
"epoch": 0.1241892610313663,
"grad_norm": 0.5641311407089233,
"learning_rate": 0.00019336126081556134,
"loss": 2.7681,
"step": 146
},
{
"epoch": 0.12503987240829345,
"grad_norm": 0.5504147410392761,
"learning_rate": 0.00019326438837816276,
"loss": 2.6905,
"step": 147
},
{
"epoch": 0.12589048378522064,
"grad_norm": 0.6101283431053162,
"learning_rate": 0.00019316683889468358,
"loss": 2.589,
"step": 148
},
{
"epoch": 0.1267410951621478,
"grad_norm": 0.7153661847114563,
"learning_rate": 0.00019306861307327725,
"loss": 2.9563,
"step": 149
},
{
"epoch": 0.12759170653907495,
"grad_norm": 0.7049738168716431,
"learning_rate": 0.00019296971162700694,
"loss": 2.8023,
"step": 150
},
{
"epoch": 0.12844231791600214,
"grad_norm": 0.3282754421234131,
"learning_rate": 0.00019287013527384062,
"loss": 2.4278,
"step": 151
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.350577712059021,
"learning_rate": 0.00019276988473664557,
"loss": 2.5845,
"step": 152
},
{
"epoch": 0.13014354066985645,
"grad_norm": 0.32433176040649414,
"learning_rate": 0.00019266896074318334,
"loss": 2.6126,
"step": 153
},
{
"epoch": 0.13099415204678364,
"grad_norm": 0.31844663619995117,
"learning_rate": 0.00019256736402610436,
"loss": 2.527,
"step": 154
},
{
"epoch": 0.1318447634237108,
"grad_norm": 0.2559802830219269,
"learning_rate": 0.00019246509532294266,
"loss": 2.2437,
"step": 155
},
{
"epoch": 0.13269537480063795,
"grad_norm": 0.28512275218963623,
"learning_rate": 0.00019236215537611046,
"loss": 2.5739,
"step": 156
},
{
"epoch": 0.13354598617756513,
"grad_norm": 0.26634740829467773,
"learning_rate": 0.00019225854493289286,
"loss": 2.4485,
"step": 157
},
{
"epoch": 0.1343965975544923,
"grad_norm": 0.2785400450229645,
"learning_rate": 0.0001921542647454424,
"loss": 2.7944,
"step": 158
},
{
"epoch": 0.13524720893141945,
"grad_norm": 0.27485981583595276,
"learning_rate": 0.00019204931557077355,
"loss": 2.6518,
"step": 159
},
{
"epoch": 0.13609782030834663,
"grad_norm": 0.2687318027019501,
"learning_rate": 0.00019194369817075724,
"loss": 2.6595,
"step": 160
},
{
"epoch": 0.1369484316852738,
"grad_norm": 0.26418977975845337,
"learning_rate": 0.00019183741331211537,
"loss": 2.7045,
"step": 161
},
{
"epoch": 0.13779904306220095,
"grad_norm": 0.28258347511291504,
"learning_rate": 0.00019173046176641513,
"loss": 2.5896,
"step": 162
},
{
"epoch": 0.13864965443912813,
"grad_norm": 0.27390146255493164,
"learning_rate": 0.00019162284431006358,
"loss": 2.5566,
"step": 163
},
{
"epoch": 0.1395002658160553,
"grad_norm": 0.2916048765182495,
"learning_rate": 0.00019151456172430183,
"loss": 2.609,
"step": 164
},
{
"epoch": 0.14035087719298245,
"grad_norm": 0.30684247612953186,
"learning_rate": 0.00019140561479519955,
"loss": 2.5222,
"step": 165
},
{
"epoch": 0.14120148856990963,
"grad_norm": 0.26836761832237244,
"learning_rate": 0.00019129600431364897,
"loss": 2.5891,
"step": 166
},
{
"epoch": 0.1420520999468368,
"grad_norm": 0.2658300995826721,
"learning_rate": 0.00019118573107535953,
"loss": 2.644,
"step": 167
},
{
"epoch": 0.14290271132376395,
"grad_norm": 0.2789425551891327,
"learning_rate": 0.00019107479588085182,
"loss": 2.5641,
"step": 168
},
{
"epoch": 0.14375332270069113,
"grad_norm": 0.2909972071647644,
"learning_rate": 0.00019096319953545185,
"loss": 2.5982,
"step": 169
},
{
"epoch": 0.1446039340776183,
"grad_norm": 0.3741363286972046,
"learning_rate": 0.0001908509428492852,
"loss": 2.6293,
"step": 170
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.2989426851272583,
"learning_rate": 0.0001907380266372712,
"loss": 2.7364,
"step": 171
},
{
"epoch": 0.14630515683147263,
"grad_norm": 0.28862622380256653,
"learning_rate": 0.00019062445171911686,
"loss": 2.5656,
"step": 172
},
{
"epoch": 0.1471557682083998,
"grad_norm": 0.3215920329093933,
"learning_rate": 0.0001905102189193112,
"loss": 2.8443,
"step": 173
},
{
"epoch": 0.14800637958532695,
"grad_norm": 0.2994636595249176,
"learning_rate": 0.00019039532906711882,
"loss": 2.7014,
"step": 174
},
{
"epoch": 0.14885699096225413,
"grad_norm": 0.32109183073043823,
"learning_rate": 0.00019027978299657436,
"loss": 2.8364,
"step": 175
},
{
"epoch": 0.1497076023391813,
"grad_norm": 0.30813783407211304,
"learning_rate": 0.00019016358154647618,
"loss": 2.5102,
"step": 176
},
{
"epoch": 0.15055821371610845,
"grad_norm": 0.32674533128738403,
"learning_rate": 0.00019004672556038028,
"loss": 2.757,
"step": 177
},
{
"epoch": 0.15140882509303563,
"grad_norm": 0.34680357575416565,
"learning_rate": 0.00018992921588659422,
"loss": 2.5228,
"step": 178
},
{
"epoch": 0.1522594364699628,
"grad_norm": 0.35170817375183105,
"learning_rate": 0.00018981105337817104,
"loss": 2.6148,
"step": 179
},
{
"epoch": 0.15311004784688995,
"grad_norm": 0.3741483986377716,
"learning_rate": 0.00018969223889290284,
"loss": 2.8025,
"step": 180
},
{
"epoch": 0.15396065922381713,
"grad_norm": 0.4156269431114197,
"learning_rate": 0.00018957277329331485,
"loss": 2.72,
"step": 181
},
{
"epoch": 0.1548112706007443,
"grad_norm": 0.3726477324962616,
"learning_rate": 0.00018945265744665886,
"loss": 2.6197,
"step": 182
},
{
"epoch": 0.15566188197767145,
"grad_norm": 0.4135706424713135,
"learning_rate": 0.00018933189222490726,
"loss": 2.7176,
"step": 183
},
{
"epoch": 0.15651249335459863,
"grad_norm": 0.38799911737442017,
"learning_rate": 0.00018921047850474642,
"loss": 2.5641,
"step": 184
},
{
"epoch": 0.1573631047315258,
"grad_norm": 0.4622843265533447,
"learning_rate": 0.00018908841716757042,
"loss": 2.7626,
"step": 185
},
{
"epoch": 0.15821371610845295,
"grad_norm": 0.4251146912574768,
"learning_rate": 0.00018896570909947475,
"loss": 2.6842,
"step": 186
},
{
"epoch": 0.15906432748538013,
"grad_norm": 0.4628697335720062,
"learning_rate": 0.00018884235519124972,
"loss": 2.9476,
"step": 187
},
{
"epoch": 0.1599149388623073,
"grad_norm": 0.5052159428596497,
"learning_rate": 0.0001887183563383741,
"loss": 2.769,
"step": 188
},
{
"epoch": 0.16076555023923444,
"grad_norm": 0.4817435145378113,
"learning_rate": 0.00018859371344100864,
"loss": 2.6266,
"step": 189
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.4751468598842621,
"learning_rate": 0.0001884684274039894,
"loss": 2.877,
"step": 190
},
{
"epoch": 0.1624667729930888,
"grad_norm": 0.5826165676116943,
"learning_rate": 0.00018834249913682132,
"loss": 2.7308,
"step": 191
},
{
"epoch": 0.16331738437001594,
"grad_norm": 0.5441760420799255,
"learning_rate": 0.00018821592955367154,
"loss": 2.6764,
"step": 192
},
{
"epoch": 0.1641679957469431,
"grad_norm": 0.5005947947502136,
"learning_rate": 0.00018808871957336275,
"loss": 2.664,
"step": 193
},
{
"epoch": 0.1650186071238703,
"grad_norm": 0.5205551981925964,
"learning_rate": 0.00018796087011936665,
"loss": 2.6192,
"step": 194
},
{
"epoch": 0.16586921850079744,
"grad_norm": 0.5489931106567383,
"learning_rate": 0.0001878323821197971,
"loss": 2.5061,
"step": 195
},
{
"epoch": 0.1667198298777246,
"grad_norm": 0.5525840520858765,
"learning_rate": 0.00018770325650740345,
"loss": 2.7474,
"step": 196
},
{
"epoch": 0.1675704412546518,
"grad_norm": 0.5978725552558899,
"learning_rate": 0.0001875734942195637,
"loss": 2.6055,
"step": 197
},
{
"epoch": 0.16842105263157894,
"grad_norm": 0.6148700714111328,
"learning_rate": 0.0001874430961982778,
"loss": 2.8352,
"step": 198
},
{
"epoch": 0.1692716640085061,
"grad_norm": 0.5956620573997498,
"learning_rate": 0.0001873120633901608,
"loss": 2.7367,
"step": 199
},
{
"epoch": 0.17012227538543329,
"grad_norm": 0.7082740664482117,
"learning_rate": 0.0001871803967464358,
"loss": 2.9437,
"step": 200
},
{
"epoch": 0.17097288676236044,
"grad_norm": 0.32244405150413513,
"learning_rate": 0.00018704809722292737,
"loss": 2.3835,
"step": 201
},
{
"epoch": 0.1718234981392876,
"grad_norm": 0.3367772102355957,
"learning_rate": 0.00018691516578005427,
"loss": 2.601,
"step": 202
},
{
"epoch": 0.17267410951621479,
"grad_norm": 0.31732872128486633,
"learning_rate": 0.00018678160338282272,
"loss": 2.5894,
"step": 203
},
{
"epoch": 0.17352472089314194,
"grad_norm": 0.27467650175094604,
"learning_rate": 0.0001866474110008193,
"loss": 2.4369,
"step": 204
},
{
"epoch": 0.1743753322700691,
"grad_norm": 0.29726937413215637,
"learning_rate": 0.00018651258960820385,
"loss": 2.6123,
"step": 205
},
{
"epoch": 0.17522594364699628,
"grad_norm": 0.27499106526374817,
"learning_rate": 0.00018637714018370253,
"loss": 2.5141,
"step": 206
},
{
"epoch": 0.17607655502392344,
"grad_norm": 0.27535390853881836,
"learning_rate": 0.00018624106371060067,
"loss": 2.5148,
"step": 207
},
{
"epoch": 0.1769271664008506,
"grad_norm": 0.2687024176120758,
"learning_rate": 0.00018610436117673555,
"loss": 2.6057,
"step": 208
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.31320950388908386,
"learning_rate": 0.00018596703357448934,
"loss": 2.6813,
"step": 209
},
{
"epoch": 0.17862838915470494,
"grad_norm": 0.25832033157348633,
"learning_rate": 0.00018582908190078185,
"loss": 2.4898,
"step": 210
},
{
"epoch": 0.1794790005316321,
"grad_norm": 0.2806166410446167,
"learning_rate": 0.00018569050715706325,
"loss": 2.5762,
"step": 211
},
{
"epoch": 0.18032961190855928,
"grad_norm": 0.26099708676338196,
"learning_rate": 0.00018555131034930685,
"loss": 2.5386,
"step": 212
},
{
"epoch": 0.18118022328548644,
"grad_norm": 0.26140880584716797,
"learning_rate": 0.00018541149248800184,
"loss": 2.7159,
"step": 213
},
{
"epoch": 0.1820308346624136,
"grad_norm": 0.2698177695274353,
"learning_rate": 0.0001852710545881459,
"loss": 2.5942,
"step": 214
},
{
"epoch": 0.18288144603934078,
"grad_norm": 0.27240726351737976,
"learning_rate": 0.00018512999766923772,
"loss": 2.5377,
"step": 215
},
{
"epoch": 0.18373205741626794,
"grad_norm": 0.2780822813510895,
"learning_rate": 0.00018498832275526988,
"loss": 2.6185,
"step": 216
},
{
"epoch": 0.1845826687931951,
"grad_norm": 0.2713901400566101,
"learning_rate": 0.00018484603087472109,
"loss": 2.5802,
"step": 217
},
{
"epoch": 0.18543328017012228,
"grad_norm": 0.2843954265117645,
"learning_rate": 0.000184703123060549,
"loss": 2.6404,
"step": 218
},
{
"epoch": 0.18628389154704944,
"grad_norm": 0.2679051160812378,
"learning_rate": 0.0001845596003501826,
"loss": 2.6688,
"step": 219
},
{
"epoch": 0.1871345029239766,
"grad_norm": 0.292568176984787,
"learning_rate": 0.00018441546378551458,
"loss": 2.6505,
"step": 220
},
{
"epoch": 0.18798511430090378,
"grad_norm": 0.282326877117157,
"learning_rate": 0.00018427071441289388,
"loss": 2.6299,
"step": 221
},
{
"epoch": 0.18883572567783094,
"grad_norm": 0.2853985130786896,
"learning_rate": 0.00018412535328311814,
"loss": 2.8143,
"step": 222
},
{
"epoch": 0.1896863370547581,
"grad_norm": 0.2786814868450165,
"learning_rate": 0.00018397938145142591,
"loss": 2.6007,
"step": 223
},
{
"epoch": 0.19053694843168528,
"grad_norm": 0.42460358142852783,
"learning_rate": 0.0001838327999774892,
"loss": 2.7891,
"step": 224
},
{
"epoch": 0.19138755980861244,
"grad_norm": 0.30478838086128235,
"learning_rate": 0.00018368560992540562,
"loss": 2.4551,
"step": 225
},
{
"epoch": 0.1922381711855396,
"grad_norm": 0.3402044177055359,
"learning_rate": 0.00018353781236369064,
"loss": 2.9191,
"step": 226
},
{
"epoch": 0.19308878256246678,
"grad_norm": 0.33662521839141846,
"learning_rate": 0.00018338940836527004,
"loss": 2.5606,
"step": 227
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.34461426734924316,
"learning_rate": 0.0001832403990074719,
"loss": 2.714,
"step": 228
},
{
"epoch": 0.1947900053163211,
"grad_norm": 0.342184454202652,
"learning_rate": 0.0001830907853720188,
"loss": 2.6936,
"step": 229
},
{
"epoch": 0.19564061669324828,
"grad_norm": 0.3557281494140625,
"learning_rate": 0.0001829405685450202,
"loss": 2.6663,
"step": 230
},
{
"epoch": 0.19649122807017544,
"grad_norm": 0.38674700260162354,
"learning_rate": 0.0001827897496169642,
"loss": 2.7257,
"step": 231
},
{
"epoch": 0.1973418394471026,
"grad_norm": 0.3849089741706848,
"learning_rate": 0.00018263832968271,
"loss": 2.7178,
"step": 232
},
{
"epoch": 0.19819245082402978,
"grad_norm": 0.4508901834487915,
"learning_rate": 0.00018248630984147955,
"loss": 2.7947,
"step": 233
},
{
"epoch": 0.19904306220095694,
"grad_norm": 0.39502936601638794,
"learning_rate": 0.00018233369119684996,
"loss": 2.5885,
"step": 234
},
{
"epoch": 0.1998936735778841,
"grad_norm": 0.4287837743759155,
"learning_rate": 0.00018218047485674523,
"loss": 2.6911,
"step": 235
},
{
"epoch": 0.20074428495481128,
"grad_norm": 0.4257849454879761,
"learning_rate": 0.00018202666193342833,
"loss": 2.8803,
"step": 236
},
{
"epoch": 0.20159489633173844,
"grad_norm": 0.4459477961063385,
"learning_rate": 0.00018187225354349295,
"loss": 2.8352,
"step": 237
},
{
"epoch": 0.2024455077086656,
"grad_norm": 0.4430312514305115,
"learning_rate": 0.0001817172508078557,
"loss": 2.7517,
"step": 238
},
{
"epoch": 0.20329611908559278,
"grad_norm": 0.4465429484844208,
"learning_rate": 0.00018156165485174773,
"loss": 2.7119,
"step": 239
},
{
"epoch": 0.20414673046251994,
"grad_norm": 0.4532601833343506,
"learning_rate": 0.00018140546680470659,
"loss": 2.7346,
"step": 240
},
{
"epoch": 0.2049973418394471,
"grad_norm": 0.4750036299228668,
"learning_rate": 0.00018124868780056814,
"loss": 2.6113,
"step": 241
},
{
"epoch": 0.20584795321637428,
"grad_norm": 0.5072234272956848,
"learning_rate": 0.00018109131897745822,
"loss": 2.844,
"step": 242
},
{
"epoch": 0.20669856459330144,
"grad_norm": 0.5094662308692932,
"learning_rate": 0.00018093336147778438,
"loss": 2.7737,
"step": 243
},
{
"epoch": 0.2075491759702286,
"grad_norm": 0.606842577457428,
"learning_rate": 0.00018077481644822768,
"loss": 2.6153,
"step": 244
},
{
"epoch": 0.20839978734715578,
"grad_norm": 0.5311163067817688,
"learning_rate": 0.00018061568503973435,
"loss": 2.6038,
"step": 245
},
{
"epoch": 0.20925039872408294,
"grad_norm": 0.5758761167526245,
"learning_rate": 0.00018045596840750723,
"loss": 2.6446,
"step": 246
},
{
"epoch": 0.2101010101010101,
"grad_norm": 0.598297119140625,
"learning_rate": 0.00018029566771099776,
"loss": 2.7002,
"step": 247
},
{
"epoch": 0.21095162147793728,
"grad_norm": 0.6635774970054626,
"learning_rate": 0.00018013478411389716,
"loss": 2.8011,
"step": 248
},
{
"epoch": 0.21180223285486444,
"grad_norm": 0.6850919723510742,
"learning_rate": 0.00017997331878412835,
"loss": 2.8903,
"step": 249
},
{
"epoch": 0.2126528442317916,
"grad_norm": 0.7298348546028137,
"learning_rate": 0.00017981127289383716,
"loss": 2.9483,
"step": 250
},
{
"epoch": 0.21350345560871878,
"grad_norm": 0.33354559540748596,
"learning_rate": 0.00017964864761938404,
"loss": 2.4727,
"step": 251
},
{
"epoch": 0.21435406698564594,
"grad_norm": 0.3557465374469757,
"learning_rate": 0.00017948544414133534,
"loss": 2.5058,
"step": 252
},
{
"epoch": 0.2152046783625731,
"grad_norm": 0.3230442702770233,
"learning_rate": 0.00017932166364445498,
"loss": 2.5422,
"step": 253
},
{
"epoch": 0.21605528973950028,
"grad_norm": 0.28668278455734253,
"learning_rate": 0.0001791573073176956,
"loss": 2.3173,
"step": 254
},
{
"epoch": 0.21690590111642744,
"grad_norm": 0.30019721388816833,
"learning_rate": 0.00017899237635419002,
"loss": 2.6444,
"step": 255
},
{
"epoch": 0.2177565124933546,
"grad_norm": 0.285314679145813,
"learning_rate": 0.0001788268719512427,
"loss": 2.5319,
"step": 256
},
{
"epoch": 0.21860712387028178,
"grad_norm": 0.27584996819496155,
"learning_rate": 0.00017866079531032088,
"loss": 2.6496,
"step": 257
},
{
"epoch": 0.21945773524720893,
"grad_norm": 0.2874069809913635,
"learning_rate": 0.0001784941476370459,
"loss": 2.5156,
"step": 258
},
{
"epoch": 0.2203083466241361,
"grad_norm": 0.26786255836486816,
"learning_rate": 0.00017832693014118448,
"loss": 2.6211,
"step": 259
},
{
"epoch": 0.22115895800106328,
"grad_norm": 0.2633914351463318,
"learning_rate": 0.0001781591440366399,
"loss": 2.5811,
"step": 260
},
{
"epoch": 0.22200956937799043,
"grad_norm": 0.2724866569042206,
"learning_rate": 0.00017799079054144334,
"loss": 2.5904,
"step": 261
},
{
"epoch": 0.2228601807549176,
"grad_norm": 0.29333001375198364,
"learning_rate": 0.00017782187087774477,
"loss": 2.7581,
"step": 262
},
{
"epoch": 0.22371079213184478,
"grad_norm": 0.2735550105571747,
"learning_rate": 0.00017765238627180424,
"loss": 2.7114,
"step": 263
},
{
"epoch": 0.22456140350877193,
"grad_norm": 0.2721397280693054,
"learning_rate": 0.00017748233795398307,
"loss": 2.5991,
"step": 264
},
{
"epoch": 0.2254120148856991,
"grad_norm": 0.25755858421325684,
"learning_rate": 0.0001773117271587346,
"loss": 2.5786,
"step": 265
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.25772804021835327,
"learning_rate": 0.00017714055512459565,
"loss": 2.488,
"step": 266
},
{
"epoch": 0.22711323763955343,
"grad_norm": 0.2766227424144745,
"learning_rate": 0.0001769688230941772,
"loss": 2.8924,
"step": 267
},
{
"epoch": 0.2279638490164806,
"grad_norm": 0.26846593618392944,
"learning_rate": 0.00017679653231415552,
"loss": 2.5783,
"step": 268
},
{
"epoch": 0.22881446039340775,
"grad_norm": 0.26374372839927673,
"learning_rate": 0.00017662368403526302,
"loss": 2.4675,
"step": 269
},
{
"epoch": 0.22966507177033493,
"grad_norm": 0.28237268328666687,
"learning_rate": 0.0001764502795122793,
"loss": 2.5994,
"step": 270
},
{
"epoch": 0.2305156831472621,
"grad_norm": 0.2786102890968323,
"learning_rate": 0.00017627632000402193,
"loss": 2.514,
"step": 271
},
{
"epoch": 0.23136629452418925,
"grad_norm": 0.27646180987358093,
"learning_rate": 0.00017610180677333739,
"loss": 2.5673,
"step": 272
},
{
"epoch": 0.23221690590111643,
"grad_norm": 0.3052549660205841,
"learning_rate": 0.00017592674108709186,
"loss": 2.5345,
"step": 273
},
{
"epoch": 0.2330675172780436,
"grad_norm": 0.30554690957069397,
"learning_rate": 0.00017575112421616202,
"loss": 2.709,
"step": 274
},
{
"epoch": 0.23391812865497075,
"grad_norm": 0.3219161331653595,
"learning_rate": 0.00017557495743542585,
"loss": 2.6825,
"step": 275
},
{
"epoch": 0.23476874003189793,
"grad_norm": 0.31834957003593445,
"learning_rate": 0.0001753982420237533,
"loss": 2.7017,
"step": 276
},
{
"epoch": 0.2356193514088251,
"grad_norm": 0.30264872312545776,
"learning_rate": 0.00017522097926399722,
"loss": 2.3725,
"step": 277
},
{
"epoch": 0.23646996278575225,
"grad_norm": 0.3283548951148987,
"learning_rate": 0.00017504317044298367,
"loss": 2.6217,
"step": 278
},
{
"epoch": 0.23732057416267943,
"grad_norm": 0.33564746379852295,
"learning_rate": 0.00017486481685150302,
"loss": 2.5738,
"step": 279
},
{
"epoch": 0.2381711855396066,
"grad_norm": 0.37258434295654297,
"learning_rate": 0.0001746859197843002,
"loss": 2.783,
"step": 280
},
{
"epoch": 0.23902179691653375,
"grad_norm": 0.3897363245487213,
"learning_rate": 0.0001745064805400656,
"loss": 2.7908,
"step": 281
},
{
"epoch": 0.23987240829346093,
"grad_norm": 0.3756699562072754,
"learning_rate": 0.00017432650042142536,
"loss": 2.5944,
"step": 282
},
{
"epoch": 0.2407230196703881,
"grad_norm": 0.3787755072116852,
"learning_rate": 0.00017414598073493216,
"loss": 2.7574,
"step": 283
},
{
"epoch": 0.24157363104731525,
"grad_norm": 0.38891106843948364,
"learning_rate": 0.0001739649227910556,
"loss": 2.8635,
"step": 284
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.40293633937835693,
"learning_rate": 0.00017378332790417273,
"loss": 2.729,
"step": 285
},
{
"epoch": 0.2432748538011696,
"grad_norm": 0.414109468460083,
"learning_rate": 0.00017360119739255852,
"loss": 2.6077,
"step": 286
},
{
"epoch": 0.24412546517809675,
"grad_norm": 0.42549028992652893,
"learning_rate": 0.0001734185325783762,
"loss": 2.7812,
"step": 287
},
{
"epoch": 0.24497607655502393,
"grad_norm": 0.42882055044174194,
"learning_rate": 0.00017323533478766777,
"loss": 2.7653,
"step": 288
},
{
"epoch": 0.2458266879319511,
"grad_norm": 0.42119139432907104,
"learning_rate": 0.00017305160535034436,
"loss": 2.5355,
"step": 289
},
{
"epoch": 0.24667729930887825,
"grad_norm": 0.4749990999698639,
"learning_rate": 0.0001728673456001766,
"loss": 2.7885,
"step": 290
},
{
"epoch": 0.24752791068580543,
"grad_norm": 0.4682268500328064,
"learning_rate": 0.00017268255687478469,
"loss": 2.6402,
"step": 291
},
{
"epoch": 0.2483785220627326,
"grad_norm": 0.4854019284248352,
"learning_rate": 0.00017249724051562906,
"loss": 2.7255,
"step": 292
},
{
"epoch": 0.24922913343965974,
"grad_norm": 0.5112527012825012,
"learning_rate": 0.00017231139786800042,
"loss": 2.8374,
"step": 293
},
{
"epoch": 0.2500797448165869,
"grad_norm": 0.5242344737052917,
"learning_rate": 0.0001721250302810101,
"loss": 2.9178,
"step": 294
},
{
"epoch": 0.2500797448165869,
"eval_loss": 2.688343048095703,
"eval_runtime": 80.6326,
"eval_samples_per_second": 12.278,
"eval_steps_per_second": 6.139,
"step": 294
}
],
"logging_steps": 1,
"max_steps": 1176,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 294,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.987046260755661e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}