Whisper-small-Ar-MDD / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 5460,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"grad_norm": 0.4075450599193573,
"learning_rate": 0.0005,
"loss": 0.0571,
"step": 25
},
{
"epoch": 0.09,
"grad_norm": 0.5959680676460266,
"learning_rate": 0.001,
"loss": 0.0853,
"step": 50
},
{
"epoch": 0.14,
"grad_norm": 1.1371592283248901,
"learning_rate": 0.0009955637707948243,
"loss": 0.0983,
"step": 75
},
{
"epoch": 0.18,
"grad_norm": 1.7857468128204346,
"learning_rate": 0.0009911275415896488,
"loss": 0.0689,
"step": 100
},
{
"epoch": 0.23,
"grad_norm": 1.947630763053894,
"learning_rate": 0.0009865064695009243,
"loss": 0.0834,
"step": 125
},
{
"epoch": 0.27,
"grad_norm": 0.750166654586792,
"learning_rate": 0.0009818853974121996,
"loss": 0.0655,
"step": 150
},
{
"epoch": 0.32,
"grad_norm": 3.860727548599243,
"learning_rate": 0.000977264325323475,
"loss": 0.0554,
"step": 175
},
{
"epoch": 0.37,
"grad_norm": 0.3753944933414459,
"learning_rate": 0.0009726432532347505,
"loss": 0.0785,
"step": 200
},
{
"epoch": 0.41,
"grad_norm": 0.4372863471508026,
"learning_rate": 0.0009680221811460259,
"loss": 0.063,
"step": 225
},
{
"epoch": 0.46,
"grad_norm": 0.31646546721458435,
"learning_rate": 0.0009634011090573013,
"loss": 0.0487,
"step": 250
},
{
"epoch": 0.5,
"grad_norm": 0.8565055131912231,
"learning_rate": 0.0009587800369685768,
"loss": 0.0586,
"step": 275
},
{
"epoch": 0.55,
"grad_norm": 0.5980587601661682,
"learning_rate": 0.0009541589648798521,
"loss": 0.0683,
"step": 300
},
{
"epoch": 0.6,
"grad_norm": 0.2764056324958801,
"learning_rate": 0.0009495378927911276,
"loss": 0.0697,
"step": 325
},
{
"epoch": 0.64,
"grad_norm": 1.5360766649246216,
"learning_rate": 0.0009449168207024029,
"loss": 0.0639,
"step": 350
},
{
"epoch": 0.69,
"grad_norm": 0.38272273540496826,
"learning_rate": 0.0009402957486136784,
"loss": 0.0816,
"step": 375
},
{
"epoch": 0.73,
"grad_norm": 0.362632155418396,
"learning_rate": 0.0009356746765249538,
"loss": 0.0734,
"step": 400
},
{
"epoch": 0.78,
"grad_norm": 1.5462536811828613,
"learning_rate": 0.0009310536044362292,
"loss": 0.2255,
"step": 425
},
{
"epoch": 0.82,
"grad_norm": 0.6713312268257141,
"learning_rate": 0.0009264325323475047,
"loss": 0.1296,
"step": 450
},
{
"epoch": 0.87,
"grad_norm": 0.8870647549629211,
"learning_rate": 0.0009219963031423291,
"loss": 0.0904,
"step": 475
},
{
"epoch": 0.92,
"grad_norm": 0.4918694496154785,
"learning_rate": 0.0009173752310536044,
"loss": 0.0689,
"step": 500
},
{
"epoch": 0.96,
"grad_norm": 0.3674885630607605,
"learning_rate": 0.0009127541589648799,
"loss": 0.0726,
"step": 525
},
{
"epoch": 1.0,
"eval_loss": 0.2210056632757187,
"eval_runtime": 173.4286,
"eval_samples_per_second": 4.688,
"eval_steps_per_second": 0.784,
"step": 546
},
{
"epoch": 1.01,
"grad_norm": 0.8691617250442505,
"learning_rate": 0.0009081330868761552,
"loss": 0.0421,
"step": 550
},
{
"epoch": 1.05,
"grad_norm": 0.8861550688743591,
"learning_rate": 0.0009035120147874307,
"loss": 0.058,
"step": 575
},
{
"epoch": 1.1,
"grad_norm": 0.10453300923109055,
"learning_rate": 0.000898890942698706,
"loss": 0.0391,
"step": 600
},
{
"epoch": 1.14,
"grad_norm": 0.11498710513114929,
"learning_rate": 0.0008942698706099815,
"loss": 0.0413,
"step": 625
},
{
"epoch": 1.19,
"grad_norm": 0.3542003333568573,
"learning_rate": 0.0008896487985212569,
"loss": 0.0764,
"step": 650
},
{
"epoch": 1.24,
"grad_norm": 0.5665566921234131,
"learning_rate": 0.0008850277264325323,
"loss": 0.0464,
"step": 675
},
{
"epoch": 1.28,
"grad_norm": 0.37183037400245667,
"learning_rate": 0.0008804066543438077,
"loss": 0.0557,
"step": 700
},
{
"epoch": 1.33,
"grad_norm": 2.33689546585083,
"learning_rate": 0.0008757855822550833,
"loss": 0.0723,
"step": 725
},
{
"epoch": 1.37,
"grad_norm": 0.47746214270591736,
"learning_rate": 0.0008711645101663586,
"loss": 0.0613,
"step": 750
},
{
"epoch": 1.42,
"grad_norm": 0.5573539137840271,
"learning_rate": 0.0008665434380776341,
"loss": 0.0581,
"step": 775
},
{
"epoch": 1.47,
"grad_norm": 0.5228638648986816,
"learning_rate": 0.0008619223659889095,
"loss": 0.0438,
"step": 800
},
{
"epoch": 1.51,
"grad_norm": 1.0585103034973145,
"learning_rate": 0.0008573012939001849,
"loss": 0.0357,
"step": 825
},
{
"epoch": 1.56,
"grad_norm": 0.13868175446987152,
"learning_rate": 0.0008526802218114603,
"loss": 0.0374,
"step": 850
},
{
"epoch": 1.6,
"grad_norm": 0.43853959441185,
"learning_rate": 0.0008480591497227357,
"loss": 0.0482,
"step": 875
},
{
"epoch": 1.65,
"grad_norm": 0.47208574414253235,
"learning_rate": 0.0008434380776340112,
"loss": 0.0551,
"step": 900
},
{
"epoch": 1.69,
"grad_norm": 0.2631681561470032,
"learning_rate": 0.0008388170055452865,
"loss": 0.0717,
"step": 925
},
{
"epoch": 1.74,
"grad_norm": 0.23163950443267822,
"learning_rate": 0.000834195933456562,
"loss": 0.0415,
"step": 950
},
{
"epoch": 1.79,
"grad_norm": 0.45487725734710693,
"learning_rate": 0.0008295748613678373,
"loss": 0.0392,
"step": 975
},
{
"epoch": 1.83,
"grad_norm": 0.40454888343811035,
"learning_rate": 0.0008249537892791128,
"loss": 0.0342,
"step": 1000
},
{
"epoch": 1.88,
"grad_norm": 0.10719649493694305,
"learning_rate": 0.0008203327171903881,
"loss": 0.0499,
"step": 1025
},
{
"epoch": 1.92,
"grad_norm": 0.5795795917510986,
"learning_rate": 0.0008157116451016636,
"loss": 0.0553,
"step": 1050
},
{
"epoch": 1.97,
"grad_norm": 0.2069532871246338,
"learning_rate": 0.000811090573012939,
"loss": 0.0419,
"step": 1075
},
{
"epoch": 2.0,
"eval_loss": 0.21386997401714325,
"eval_runtime": 174.0952,
"eval_samples_per_second": 4.67,
"eval_steps_per_second": 0.781,
"step": 1092
},
{
"epoch": 2.01,
"grad_norm": 0.170976459980011,
"learning_rate": 0.0008064695009242144,
"loss": 0.0373,
"step": 1100
},
{
"epoch": 2.06,
"grad_norm": 0.10965342819690704,
"learning_rate": 0.0008018484288354898,
"loss": 0.0286,
"step": 1125
},
{
"epoch": 2.11,
"grad_norm": 0.02158469147980213,
"learning_rate": 0.0007972273567467652,
"loss": 0.05,
"step": 1150
},
{
"epoch": 2.15,
"grad_norm": 1.0225136280059814,
"learning_rate": 0.0007926062846580406,
"loss": 0.0423,
"step": 1175
},
{
"epoch": 2.2,
"grad_norm": 0.09866318106651306,
"learning_rate": 0.0007879852125693162,
"loss": 0.0376,
"step": 1200
},
{
"epoch": 2.24,
"grad_norm": 0.23199380934238434,
"learning_rate": 0.0007833641404805915,
"loss": 0.0293,
"step": 1225
},
{
"epoch": 2.29,
"grad_norm": 0.05752483755350113,
"learning_rate": 0.000778743068391867,
"loss": 0.0381,
"step": 1250
},
{
"epoch": 2.34,
"grad_norm": 0.13506996631622314,
"learning_rate": 0.0007741219963031424,
"loss": 0.0394,
"step": 1275
},
{
"epoch": 2.38,
"grad_norm": 1.1013309955596924,
"learning_rate": 0.0007695009242144178,
"loss": 0.0394,
"step": 1300
},
{
"epoch": 2.43,
"grad_norm": 0.43956679105758667,
"learning_rate": 0.0007648798521256932,
"loss": 0.0459,
"step": 1325
},
{
"epoch": 2.47,
"grad_norm": 0.39061295986175537,
"learning_rate": 0.0007602587800369686,
"loss": 0.037,
"step": 1350
},
{
"epoch": 2.52,
"grad_norm": 0.2657981216907501,
"learning_rate": 0.0007556377079482441,
"loss": 0.0327,
"step": 1375
},
{
"epoch": 2.56,
"grad_norm": 0.4138255715370178,
"learning_rate": 0.0007510166358595194,
"loss": 0.0307,
"step": 1400
},
{
"epoch": 2.61,
"grad_norm": 0.32367995381355286,
"learning_rate": 0.0007463955637707949,
"loss": 0.0335,
"step": 1425
},
{
"epoch": 2.66,
"grad_norm": 0.5355994701385498,
"learning_rate": 0.0007417744916820702,
"loss": 0.0262,
"step": 1450
},
{
"epoch": 2.7,
"grad_norm": 3.182929039001465,
"learning_rate": 0.0007371534195933457,
"loss": 0.0302,
"step": 1475
},
{
"epoch": 2.75,
"grad_norm": 0.9068237543106079,
"learning_rate": 0.000732532347504621,
"loss": 0.0318,
"step": 1500
},
{
"epoch": 2.79,
"grad_norm": 0.804796576499939,
"learning_rate": 0.0007279112754158965,
"loss": 0.0462,
"step": 1525
},
{
"epoch": 2.84,
"grad_norm": 0.40627536177635193,
"learning_rate": 0.0007232902033271719,
"loss": 0.0226,
"step": 1550
},
{
"epoch": 2.88,
"grad_norm": 0.2852160632610321,
"learning_rate": 0.0007186691312384473,
"loss": 0.0327,
"step": 1575
},
{
"epoch": 2.93,
"grad_norm": 0.5738157629966736,
"learning_rate": 0.0007140480591497227,
"loss": 0.0317,
"step": 1600
},
{
"epoch": 2.98,
"grad_norm": 0.2782443165779114,
"learning_rate": 0.0007094269870609981,
"loss": 0.0322,
"step": 1625
},
{
"epoch": 3.0,
"eval_loss": 0.1934811770915985,
"eval_runtime": 175.4238,
"eval_samples_per_second": 4.634,
"eval_steps_per_second": 0.775,
"step": 1638
},
{
"epoch": 3.02,
"grad_norm": 0.027267010882496834,
"learning_rate": 0.0007048059149722735,
"loss": 0.0248,
"step": 1650
},
{
"epoch": 3.07,
"grad_norm": 0.23983055353164673,
"learning_rate": 0.000700184842883549,
"loss": 0.0252,
"step": 1675
},
{
"epoch": 3.11,
"grad_norm": 0.03389419987797737,
"learning_rate": 0.0006955637707948245,
"loss": 0.0216,
"step": 1700
},
{
"epoch": 3.16,
"grad_norm": 2.448323965072632,
"learning_rate": 0.0006909426987060999,
"loss": 0.0402,
"step": 1725
},
{
"epoch": 3.21,
"grad_norm": 0.5986452102661133,
"learning_rate": 0.0006863216266173753,
"loss": 0.0349,
"step": 1750
},
{
"epoch": 3.25,
"grad_norm": 0.046656377613544464,
"learning_rate": 0.0006817005545286507,
"loss": 0.0179,
"step": 1775
},
{
"epoch": 3.3,
"grad_norm": 0.2432301789522171,
"learning_rate": 0.0006770794824399261,
"loss": 0.0261,
"step": 1800
},
{
"epoch": 3.34,
"grad_norm": 0.4144662022590637,
"learning_rate": 0.0006724584103512015,
"loss": 0.0256,
"step": 1825
},
{
"epoch": 3.39,
"grad_norm": 0.27171510457992554,
"learning_rate": 0.000667837338262477,
"loss": 0.0322,
"step": 1850
},
{
"epoch": 3.43,
"grad_norm": 0.1022319346666336,
"learning_rate": 0.0006632162661737523,
"loss": 0.0293,
"step": 1875
},
{
"epoch": 3.48,
"grad_norm": 0.16478094458580017,
"learning_rate": 0.0006585951940850278,
"loss": 0.0178,
"step": 1900
},
{
"epoch": 3.53,
"grad_norm": 0.1675555408000946,
"learning_rate": 0.0006539741219963031,
"loss": 0.0174,
"step": 1925
},
{
"epoch": 3.57,
"grad_norm": 0.39023590087890625,
"learning_rate": 0.0006493530499075786,
"loss": 0.0149,
"step": 1950
},
{
"epoch": 3.62,
"grad_norm": 0.025721503421664238,
"learning_rate": 0.0006447319778188539,
"loss": 0.0231,
"step": 1975
},
{
"epoch": 3.66,
"grad_norm": 0.3088337182998657,
"learning_rate": 0.0006401109057301294,
"loss": 0.0283,
"step": 2000
},
{
"epoch": 3.71,
"grad_norm": 0.06729228049516678,
"learning_rate": 0.0006354898336414048,
"loss": 0.0204,
"step": 2025
},
{
"epoch": 3.75,
"grad_norm": 0.18552298843860626,
"learning_rate": 0.0006308687615526802,
"loss": 0.0274,
"step": 2050
},
{
"epoch": 3.8,
"grad_norm": 0.08045148104429245,
"learning_rate": 0.0006262476894639556,
"loss": 0.0218,
"step": 2075
},
{
"epoch": 3.85,
"grad_norm": 0.6443850994110107,
"learning_rate": 0.000621626617375231,
"loss": 0.0207,
"step": 2100
},
{
"epoch": 3.89,
"grad_norm": 0.6463542580604553,
"learning_rate": 0.0006170055452865064,
"loss": 0.0322,
"step": 2125
},
{
"epoch": 3.94,
"grad_norm": 0.2903934419155121,
"learning_rate": 0.000612384473197782,
"loss": 0.031,
"step": 2150
},
{
"epoch": 3.98,
"grad_norm": 0.1343035101890564,
"learning_rate": 0.0006077634011090574,
"loss": 0.0175,
"step": 2175
},
{
"epoch": 4.0,
"eval_loss": 0.1896440088748932,
"eval_runtime": 176.3159,
"eval_samples_per_second": 4.611,
"eval_steps_per_second": 0.771,
"step": 2184
},
{
"epoch": 4.03,
"grad_norm": 0.10466930270195007,
"learning_rate": 0.0006031423290203328,
"loss": 0.0215,
"step": 2200
},
{
"epoch": 4.08,
"grad_norm": 0.35988566279411316,
"learning_rate": 0.0005985212569316082,
"loss": 0.0193,
"step": 2225
},
{
"epoch": 4.12,
"grad_norm": 0.16410423815250397,
"learning_rate": 0.0005939001848428836,
"loss": 0.0143,
"step": 2250
},
{
"epoch": 4.17,
"grad_norm": 0.2650511562824249,
"learning_rate": 0.000589279112754159,
"loss": 0.0268,
"step": 2275
},
{
"epoch": 4.21,
"grad_norm": 0.2793768048286438,
"learning_rate": 0.0005846580406654344,
"loss": 0.0159,
"step": 2300
},
{
"epoch": 4.26,
"grad_norm": 2.7625114917755127,
"learning_rate": 0.0005800369685767099,
"loss": 0.0226,
"step": 2325
},
{
"epoch": 4.3,
"grad_norm": 0.45461520552635193,
"learning_rate": 0.0005754158964879852,
"loss": 0.0137,
"step": 2350
},
{
"epoch": 4.35,
"grad_norm": 0.28511613607406616,
"learning_rate": 0.0005707948243992607,
"loss": 0.0184,
"step": 2375
},
{
"epoch": 4.4,
"grad_norm": 0.5333670377731323,
"learning_rate": 0.000566173752310536,
"loss": 0.0186,
"step": 2400
},
{
"epoch": 4.44,
"grad_norm": 0.41222718358039856,
"learning_rate": 0.0005615526802218115,
"loss": 0.011,
"step": 2425
},
{
"epoch": 4.49,
"grad_norm": 0.27146583795547485,
"learning_rate": 0.0005569316081330868,
"loss": 0.0165,
"step": 2450
},
{
"epoch": 4.53,
"grad_norm": 0.29553595185279846,
"learning_rate": 0.0005523105360443623,
"loss": 0.0138,
"step": 2475
},
{
"epoch": 4.58,
"grad_norm": 0.13532432913780212,
"learning_rate": 0.0005476894639556377,
"loss": 0.0167,
"step": 2500
},
{
"epoch": 4.62,
"grad_norm": 0.10051342844963074,
"learning_rate": 0.0005430683918669131,
"loss": 0.0152,
"step": 2525
},
{
"epoch": 4.67,
"grad_norm": 0.023720353841781616,
"learning_rate": 0.0005384473197781885,
"loss": 0.0155,
"step": 2550
},
{
"epoch": 4.72,
"grad_norm": 0.2686695456504822,
"learning_rate": 0.0005338262476894639,
"loss": 0.0125,
"step": 2575
},
{
"epoch": 4.76,
"grad_norm": 0.33857473731040955,
"learning_rate": 0.0005292051756007393,
"loss": 0.0332,
"step": 2600
},
{
"epoch": 4.81,
"grad_norm": 0.0131806880235672,
"learning_rate": 0.0005245841035120147,
"loss": 0.014,
"step": 2625
},
{
"epoch": 4.85,
"grad_norm": 0.4342842698097229,
"learning_rate": 0.0005199630314232903,
"loss": 0.016,
"step": 2650
},
{
"epoch": 4.9,
"grad_norm": 0.005540889222174883,
"learning_rate": 0.0005153419593345657,
"loss": 0.0134,
"step": 2675
},
{
"epoch": 4.95,
"grad_norm": 0.004122666083276272,
"learning_rate": 0.0005107208872458411,
"loss": 0.0223,
"step": 2700
},
{
"epoch": 4.99,
"grad_norm": 0.14384405314922333,
"learning_rate": 0.0005060998151571165,
"loss": 0.0266,
"step": 2725
},
{
"epoch": 5.0,
"eval_loss": 0.19267761707305908,
"eval_runtime": 179.8301,
"eval_samples_per_second": 4.521,
"eval_steps_per_second": 0.756,
"step": 2730
},
{
"epoch": 5.04,
"grad_norm": 0.3819844126701355,
"learning_rate": 0.0005014787430683919,
"loss": 0.0166,
"step": 2750
},
{
"epoch": 5.08,
"grad_norm": 0.27138832211494446,
"learning_rate": 0.0004968576709796673,
"loss": 0.0056,
"step": 2775
},
{
"epoch": 5.13,
"grad_norm": 0.36033156514167786,
"learning_rate": 0.0004922365988909427,
"loss": 0.0084,
"step": 2800
},
{
"epoch": 5.17,
"grad_norm": 0.3422500789165497,
"learning_rate": 0.0004876155268022181,
"loss": 0.0089,
"step": 2825
},
{
"epoch": 5.22,
"grad_norm": 0.12272176891565323,
"learning_rate": 0.0004829944547134935,
"loss": 0.0079,
"step": 2850
},
{
"epoch": 5.27,
"grad_norm": 0.03446231782436371,
"learning_rate": 0.000478373382624769,
"loss": 0.011,
"step": 2875
},
{
"epoch": 5.31,
"grad_norm": 0.2042599320411682,
"learning_rate": 0.0004737523105360444,
"loss": 0.0091,
"step": 2900
},
{
"epoch": 5.36,
"grad_norm": 0.18888217210769653,
"learning_rate": 0.0004691312384473198,
"loss": 0.0146,
"step": 2925
},
{
"epoch": 5.4,
"grad_norm": 4.216693878173828,
"learning_rate": 0.0004645101663585952,
"loss": 0.0162,
"step": 2950
},
{
"epoch": 5.45,
"grad_norm": 0.20249082148075104,
"learning_rate": 0.0004598890942698706,
"loss": 0.0193,
"step": 2975
},
{
"epoch": 5.49,
"grad_norm": 0.37886273860931396,
"learning_rate": 0.00045526802218114607,
"loss": 0.0163,
"step": 3000
},
{
"epoch": 5.54,
"grad_norm": 0.24141408503055573,
"learning_rate": 0.0004506469500924215,
"loss": 0.0147,
"step": 3025
},
{
"epoch": 5.59,
"grad_norm": 0.23406554758548737,
"learning_rate": 0.0004460258780036969,
"loss": 0.0145,
"step": 3050
},
{
"epoch": 5.63,
"grad_norm": 0.355023056268692,
"learning_rate": 0.0004414048059149723,
"loss": 0.0144,
"step": 3075
},
{
"epoch": 5.68,
"grad_norm": 0.18628603219985962,
"learning_rate": 0.0004367837338262477,
"loss": 0.0105,
"step": 3100
},
{
"epoch": 5.72,
"grad_norm": 0.328931987285614,
"learning_rate": 0.0004321626617375231,
"loss": 0.0107,
"step": 3125
},
{
"epoch": 5.77,
"grad_norm": 0.004133810754865408,
"learning_rate": 0.0004275415896487985,
"loss": 0.01,
"step": 3150
},
{
"epoch": 5.82,
"grad_norm": 0.036314379423856735,
"learning_rate": 0.0004229205175600739,
"loss": 0.0127,
"step": 3175
},
{
"epoch": 5.86,
"grad_norm": 0.27704620361328125,
"learning_rate": 0.00041829944547134933,
"loss": 0.0111,
"step": 3200
},
{
"epoch": 5.91,
"grad_norm": 0.5109962821006775,
"learning_rate": 0.00041367837338262474,
"loss": 0.0157,
"step": 3225
},
{
"epoch": 5.95,
"grad_norm": 0.09048620611429214,
"learning_rate": 0.0004090573012939002,
"loss": 0.0184,
"step": 3250
},
{
"epoch": 6.0,
"grad_norm": 0.010707640089094639,
"learning_rate": 0.0004044362292051756,
"loss": 0.0178,
"step": 3275
},
{
"epoch": 6.0,
"eval_loss": 0.20126062631607056,
"eval_runtime": 179.6758,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 0.757,
"step": 3276
},
{
"epoch": 6.04,
"grad_norm": 0.013095181435346603,
"learning_rate": 0.000399815157116451,
"loss": 0.0106,
"step": 3300
},
{
"epoch": 6.09,
"grad_norm": 0.15969859063625336,
"learning_rate": 0.0003951940850277264,
"loss": 0.0098,
"step": 3325
},
{
"epoch": 6.14,
"grad_norm": 0.09395785629749298,
"learning_rate": 0.0003905730129390019,
"loss": 0.012,
"step": 3350
},
{
"epoch": 6.18,
"grad_norm": 0.010071701370179653,
"learning_rate": 0.0003859519408502773,
"loss": 0.0071,
"step": 3375
},
{
"epoch": 6.23,
"grad_norm": 0.005003762431442738,
"learning_rate": 0.0003813308687615527,
"loss": 0.0093,
"step": 3400
},
{
"epoch": 6.27,
"grad_norm": 0.01751079224050045,
"learning_rate": 0.0003767097966728281,
"loss": 0.0093,
"step": 3425
},
{
"epoch": 6.32,
"grad_norm": 0.048858534544706345,
"learning_rate": 0.0003720887245841035,
"loss": 0.0092,
"step": 3450
},
{
"epoch": 6.36,
"grad_norm": 0.05492233484983444,
"learning_rate": 0.0003674676524953789,
"loss": 0.0137,
"step": 3475
},
{
"epoch": 6.41,
"grad_norm": 0.011647823266685009,
"learning_rate": 0.0003628465804066544,
"loss": 0.008,
"step": 3500
},
{
"epoch": 6.46,
"grad_norm": 0.02023889683187008,
"learning_rate": 0.0003582255083179298,
"loss": 0.0064,
"step": 3525
},
{
"epoch": 6.5,
"grad_norm": 0.1093795895576477,
"learning_rate": 0.0003536044362292052,
"loss": 0.0087,
"step": 3550
},
{
"epoch": 6.55,
"grad_norm": 0.33639025688171387,
"learning_rate": 0.0003489833641404806,
"loss": 0.0127,
"step": 3575
},
{
"epoch": 6.59,
"grad_norm": 0.08823797106742859,
"learning_rate": 0.000344362292051756,
"loss": 0.0112,
"step": 3600
},
{
"epoch": 6.64,
"grad_norm": 0.052434779703617096,
"learning_rate": 0.0003397412199630314,
"loss": 0.0116,
"step": 3625
},
{
"epoch": 6.68,
"grad_norm": 0.1535090208053589,
"learning_rate": 0.0003351201478743068,
"loss": 0.011,
"step": 3650
},
{
"epoch": 6.73,
"grad_norm": 0.2711283564567566,
"learning_rate": 0.00033049907578558223,
"loss": 0.007,
"step": 3675
},
{
"epoch": 6.78,
"grad_norm": 0.006919647566974163,
"learning_rate": 0.00032587800369685764,
"loss": 0.0098,
"step": 3700
},
{
"epoch": 6.82,
"grad_norm": 0.03872460126876831,
"learning_rate": 0.0003212569316081331,
"loss": 0.0092,
"step": 3725
},
{
"epoch": 6.87,
"grad_norm": 0.0396348237991333,
"learning_rate": 0.0003166358595194085,
"loss": 0.0126,
"step": 3750
},
{
"epoch": 6.91,
"grad_norm": 0.008865280076861382,
"learning_rate": 0.0003120147874306839,
"loss": 0.0097,
"step": 3775
},
{
"epoch": 6.96,
"grad_norm": 0.2857593894004822,
"learning_rate": 0.0003073937153419594,
"loss": 0.0081,
"step": 3800
},
{
"epoch": 7.0,
"eval_loss": 0.19787272810935974,
"eval_runtime": 178.7624,
"eval_samples_per_second": 4.548,
"eval_steps_per_second": 0.761,
"step": 3822
},
{
"epoch": 7.01,
"grad_norm": 0.162245973944664,
"learning_rate": 0.0003027726432532348,
"loss": 0.0091,
"step": 3825
},
{
"epoch": 7.05,
"grad_norm": 0.08275479078292847,
"learning_rate": 0.0002981515711645102,
"loss": 0.0064,
"step": 3850
},
{
"epoch": 7.1,
"grad_norm": 0.04956310614943504,
"learning_rate": 0.0002935304990757856,
"loss": 0.0033,
"step": 3875
},
{
"epoch": 7.14,
"grad_norm": 0.2950696647167206,
"learning_rate": 0.000288909426987061,
"loss": 0.0059,
"step": 3900
},
{
"epoch": 7.19,
"grad_norm": 0.16667646169662476,
"learning_rate": 0.0002842883548983364,
"loss": 0.0065,
"step": 3925
},
{
"epoch": 7.23,
"grad_norm": 0.018928788602352142,
"learning_rate": 0.0002796672828096118,
"loss": 0.0053,
"step": 3950
},
{
"epoch": 7.28,
"grad_norm": 0.01914687640964985,
"learning_rate": 0.0002750462107208873,
"loss": 0.0058,
"step": 3975
},
{
"epoch": 7.33,
"grad_norm": 0.009565665386617184,
"learning_rate": 0.0002704251386321627,
"loss": 0.0042,
"step": 4000
},
{
"epoch": 7.37,
"grad_norm": 0.10117679834365845,
"learning_rate": 0.0002658040665434381,
"loss": 0.0081,
"step": 4025
},
{
"epoch": 7.42,
"grad_norm": 0.10825569927692413,
"learning_rate": 0.0002611829944547135,
"loss": 0.0088,
"step": 4050
},
{
"epoch": 7.46,
"grad_norm": 0.008808852173388004,
"learning_rate": 0.0002565619223659889,
"loss": 0.0052,
"step": 4075
},
{
"epoch": 7.51,
"grad_norm": 0.0186983160674572,
"learning_rate": 0.0002519408502772643,
"loss": 0.0051,
"step": 4100
},
{
"epoch": 7.55,
"grad_norm": 0.07354945689439774,
"learning_rate": 0.0002473197781885397,
"loss": 0.0055,
"step": 4125
},
{
"epoch": 7.6,
"grad_norm": 0.0021155644208192825,
"learning_rate": 0.0002426987060998152,
"loss": 0.0044,
"step": 4150
},
{
"epoch": 7.65,
"grad_norm": 0.08616074174642563,
"learning_rate": 0.0002380776340110906,
"loss": 0.0037,
"step": 4175
},
{
"epoch": 7.69,
"grad_norm": 0.009911403059959412,
"learning_rate": 0.000233456561922366,
"loss": 0.0073,
"step": 4200
},
{
"epoch": 7.74,
"grad_norm": 0.36762863397598267,
"learning_rate": 0.0002288354898336414,
"loss": 0.004,
"step": 4225
},
{
"epoch": 7.78,
"grad_norm": 0.0590713806450367,
"learning_rate": 0.00022421441774491682,
"loss": 0.0034,
"step": 4250
},
{
"epoch": 7.83,
"grad_norm": 0.0876949205994606,
"learning_rate": 0.00021959334565619225,
"loss": 0.0061,
"step": 4275
},
{
"epoch": 7.88,
"grad_norm": 0.2488565295934677,
"learning_rate": 0.00021497227356746766,
"loss": 0.0047,
"step": 4300
},
{
"epoch": 7.92,
"grad_norm": 0.16184526681900024,
"learning_rate": 0.00021035120147874306,
"loss": 0.0064,
"step": 4325
},
{
"epoch": 7.97,
"grad_norm": 0.025223182514309883,
"learning_rate": 0.00020573012939001847,
"loss": 0.0081,
"step": 4350
},
{
"epoch": 8.0,
"eval_loss": 0.21132159233093262,
"eval_runtime": 178.6799,
"eval_samples_per_second": 4.55,
"eval_steps_per_second": 0.761,
"step": 4368
},
{
"epoch": 8.01,
"grad_norm": 0.04916756972670555,
"learning_rate": 0.00020110905730129388,
"loss": 0.0049,
"step": 4375
},
{
"epoch": 8.06,
"grad_norm": 0.010703769512474537,
"learning_rate": 0.00019648798521256934,
"loss": 0.0034,
"step": 4400
},
{
"epoch": 8.1,
"grad_norm": 0.004313566256314516,
"learning_rate": 0.00019186691312384475,
"loss": 0.003,
"step": 4425
},
{
"epoch": 8.15,
"grad_norm": 0.18936963379383087,
"learning_rate": 0.00018724584103512016,
"loss": 0.004,
"step": 4450
},
{
"epoch": 8.2,
"grad_norm": 0.0596047043800354,
"learning_rate": 0.00018262476894639556,
"loss": 0.0027,
"step": 4475
},
{
"epoch": 8.24,
"grad_norm": 0.0016723590670153499,
"learning_rate": 0.00017800369685767097,
"loss": 0.0037,
"step": 4500
},
{
"epoch": 8.29,
"grad_norm": 0.026407798752188683,
"learning_rate": 0.0001733826247689464,
"loss": 0.0026,
"step": 4525
},
{
"epoch": 8.33,
"grad_norm": 0.004466090817004442,
"learning_rate": 0.0001687615526802218,
"loss": 0.0044,
"step": 4550
},
{
"epoch": 8.38,
"grad_norm": 0.013297215104103088,
"learning_rate": 0.00016414048059149722,
"loss": 0.0044,
"step": 4575
},
{
"epoch": 8.42,
"grad_norm": 0.013365192338824272,
"learning_rate": 0.00015951940850277263,
"loss": 0.0033,
"step": 4600
},
{
"epoch": 8.47,
"grad_norm": 0.32592836022377014,
"learning_rate": 0.0001548983364140481,
"loss": 0.0056,
"step": 4625
},
{
"epoch": 8.52,
"grad_norm": 0.023310931399464607,
"learning_rate": 0.0001502772643253235,
"loss": 0.0017,
"step": 4650
},
{
"epoch": 8.56,
"grad_norm": 0.0938984677195549,
"learning_rate": 0.0001456561922365989,
"loss": 0.0028,
"step": 4675
},
{
"epoch": 8.61,
"grad_norm": 0.006782053969800472,
"learning_rate": 0.0001410351201478743,
"loss": 0.0019,
"step": 4700
},
{
"epoch": 8.65,
"grad_norm": 0.08395280689001083,
"learning_rate": 0.00013641404805914972,
"loss": 0.0024,
"step": 4725
},
{
"epoch": 8.7,
"grad_norm": 0.04261644929647446,
"learning_rate": 0.00013179297597042515,
"loss": 0.0029,
"step": 4750
},
{
"epoch": 8.75,
"grad_norm": 0.020602483302354813,
"learning_rate": 0.00012717190388170056,
"loss": 0.0025,
"step": 4775
},
{
"epoch": 8.79,
"grad_norm": 0.0013005019864067435,
"learning_rate": 0.00012255083179297597,
"loss": 0.0024,
"step": 4800
},
{
"epoch": 8.84,
"grad_norm": 0.0019000261090695858,
"learning_rate": 0.00011792975970425139,
"loss": 0.004,
"step": 4825
},
{
"epoch": 8.88,
"grad_norm": 0.02021609991788864,
"learning_rate": 0.00011330868761552681,
"loss": 0.0023,
"step": 4850
},
{
"epoch": 8.93,
"grad_norm": 0.012654704973101616,
"learning_rate": 0.00010868761552680221,
"loss": 0.0036,
"step": 4875
},
{
"epoch": 8.97,
"grad_norm": 0.009410886093974113,
"learning_rate": 0.00010406654343807764,
"loss": 0.0018,
"step": 4900
},
{
"epoch": 9.0,
"eval_loss": 0.2146490514278412,
"eval_runtime": 176.7137,
"eval_samples_per_second": 4.601,
"eval_steps_per_second": 0.77,
"step": 4914
},
{
"epoch": 9.02,
"grad_norm": 0.00876565556973219,
"learning_rate": 9.944547134935306e-05,
"loss": 0.0024,
"step": 4925
},
{
"epoch": 9.07,
"grad_norm": 0.001896819332614541,
"learning_rate": 9.482439926062846e-05,
"loss": 0.0013,
"step": 4950
},
{
"epoch": 9.11,
"grad_norm": 0.007586441468447447,
"learning_rate": 9.020332717190388e-05,
"loss": 0.0017,
"step": 4975
},
{
"epoch": 9.16,
"grad_norm": 0.006564935203641653,
"learning_rate": 8.558225508317929e-05,
"loss": 0.003,
"step": 5000
},
{
"epoch": 9.2,
"grad_norm": 0.005424303933978081,
"learning_rate": 8.096118299445473e-05,
"loss": 0.0014,
"step": 5025
},
{
"epoch": 9.25,
"grad_norm": 0.0165091622620821,
"learning_rate": 7.634011090573013e-05,
"loss": 0.0027,
"step": 5050
},
{
"epoch": 9.29,
"grad_norm": 0.09231999516487122,
"learning_rate": 7.171903881700554e-05,
"loss": 0.0018,
"step": 5075
},
{
"epoch": 9.34,
"grad_norm": 0.16238878667354584,
"learning_rate": 6.709796672828096e-05,
"loss": 0.0015,
"step": 5100
},
{
"epoch": 9.39,
"grad_norm": 0.04476441815495491,
"learning_rate": 6.247689463955638e-05,
"loss": 0.0011,
"step": 5125
},
{
"epoch": 9.43,
"grad_norm": 0.00874653086066246,
"learning_rate": 5.785582255083179e-05,
"loss": 0.0008,
"step": 5150
},
{
"epoch": 9.48,
"grad_norm": 0.010477906093001366,
"learning_rate": 5.323475046210721e-05,
"loss": 0.0021,
"step": 5175
},
{
"epoch": 9.52,
"grad_norm": 0.00953985471278429,
"learning_rate": 4.8613678373382625e-05,
"loss": 0.0017,
"step": 5200
},
{
"epoch": 9.57,
"grad_norm": 0.0022518846672028303,
"learning_rate": 4.3992606284658045e-05,
"loss": 0.0019,
"step": 5225
},
{
"epoch": 9.62,
"grad_norm": 0.037685129791498184,
"learning_rate": 3.937153419593346e-05,
"loss": 0.001,
"step": 5250
},
{
"epoch": 9.66,
"grad_norm": 0.08190955966711044,
"learning_rate": 3.4750462107208874e-05,
"loss": 0.0017,
"step": 5275
},
{
"epoch": 9.71,
"grad_norm": 0.017375241965055466,
"learning_rate": 3.012939001848429e-05,
"loss": 0.0016,
"step": 5300
},
{
"epoch": 9.75,
"grad_norm": 0.03486447408795357,
"learning_rate": 2.5508317929759705e-05,
"loss": 0.0012,
"step": 5325
},
{
"epoch": 9.8,
"grad_norm": 0.0786125510931015,
"learning_rate": 2.088724584103512e-05,
"loss": 0.0012,
"step": 5350
},
{
"epoch": 9.84,
"grad_norm": 0.09049534052610397,
"learning_rate": 1.6266173752310537e-05,
"loss": 0.0012,
"step": 5375
},
{
"epoch": 9.89,
"grad_norm": 0.012832165695726871,
"learning_rate": 1.1645101663585952e-05,
"loss": 0.0014,
"step": 5400
},
{
"epoch": 9.94,
"grad_norm": 0.006516186986118555,
"learning_rate": 7.024029574861368e-06,
"loss": 0.0015,
"step": 5425
},
{
"epoch": 9.98,
"grad_norm": 0.02494051493704319,
"learning_rate": 2.402957486136784e-06,
"loss": 0.0015,
"step": 5450
},
{
"epoch": 10.0,
"eval_loss": 0.221242755651474,
"eval_runtime": 176.7742,
"eval_samples_per_second": 4.599,
"eval_steps_per_second": 0.769,
"step": 5460
},
{
"epoch": 10.0,
"step": 5460,
"total_flos": 9.7789895073792e+18,
"train_loss": 0.024085146698745945,
"train_runtime": 10729.5864,
"train_samples_per_second": 3.05,
"train_steps_per_second": 0.509
}
],
"logging_steps": 25,
"max_steps": 5460,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 9.7789895073792e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}
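
A minimal sketch, not part of the original file, showing how the log_history above could be summarized with Python. It assumes this JSON has been saved locally as trainer_state.json (the filename shown in the repository path); the script only reads keys that actually appear in the entries above (loss, eval_loss, epoch, step).

# Sketch: summarize trainer_state.json (assumed to be saved in the working directory).
import json

with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Training-loss entries carry "loss"; the per-epoch evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training steps: {len(train_logs)}, evaluations: {len(eval_logs)}")

# Report the epoch with the lowest evaluation loss.
best = min(eval_logs, key=lambda e: e["eval_loss"])
print(f"lowest eval_loss {best['eval_loss']:.4f} at epoch {best['epoch']:.0f} (step {best['step']})")

# Per-epoch evaluation curve.
for e in eval_logs:
    print(f"epoch {e['epoch']:>4.0f}: eval_loss={e['eval_loss']:.4f}")

Run against this file, the loop would print the ten eval_loss values logged at steps 546 through 5460, with the minimum at epoch 4 (eval_loss ≈ 0.1896).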