tower-7b-translate-ro-rup-en / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9898162809361648,
"eval_steps": 300,
"global_step": 5764,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00034704899898054355,
"eval_loss": 3.3464338779449463,
"eval_runtime": 18.8384,
"eval_samples_per_second": 23.357,
"eval_steps_per_second": 23.357,
"step": 1
},
{
"epoch": 0.00867622497451359,
"grad_norm": 11.25,
"learning_rate": 5e-06,
"loss": 2.9137,
"step": 25
},
{
"epoch": 0.01735244994902718,
"grad_norm": 5.46875,
"learning_rate": 1e-05,
"loss": 2.2899,
"step": 50
},
{
"epoch": 0.026028674923540766,
"grad_norm": 6.90625,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.8283,
"step": 75
},
{
"epoch": 0.03470489989805436,
"grad_norm": 5.09375,
"learning_rate": 2e-05,
"loss": 1.5934,
"step": 100
},
{
"epoch": 0.04338112487256795,
"grad_norm": 5.125,
"learning_rate": 1.9999763673911112e-05,
"loss": 1.4074,
"step": 125
},
{
"epoch": 0.05205734984708153,
"grad_norm": 4.28125,
"learning_rate": 1.9999054706814453e-05,
"loss": 1.3029,
"step": 150
},
{
"epoch": 0.060733574821595124,
"grad_norm": 4.375,
"learning_rate": 1.9997873132219502e-05,
"loss": 1.2048,
"step": 175
},
{
"epoch": 0.06940979979610871,
"grad_norm": 4.21875,
"learning_rate": 1.9996219005973644e-05,
"loss": 1.1517,
"step": 200
},
{
"epoch": 0.0780860247706223,
"grad_norm": 4.25,
"learning_rate": 1.9994092406259516e-05,
"loss": 1.1061,
"step": 225
},
{
"epoch": 0.0867622497451359,
"grad_norm": 4.09375,
"learning_rate": 1.9991493433591315e-05,
"loss": 1.0597,
"step": 250
},
{
"epoch": 0.09543847471964947,
"grad_norm": 4.25,
"learning_rate": 1.998842221081005e-05,
"loss": 1.0366,
"step": 275
},
{
"epoch": 0.10411469969416307,
"grad_norm": 4.21875,
"learning_rate": 1.998487888307774e-05,
"loss": 1.0176,
"step": 300
},
{
"epoch": 0.10411469969416307,
"eval_loss": 1.072221279144287,
"eval_runtime": 19.1563,
"eval_samples_per_second": 22.969,
"eval_steps_per_second": 22.969,
"step": 300
},
{
"epoch": 0.11279092466867666,
"grad_norm": 4.03125,
"learning_rate": 1.998086361787053e-05,
"loss": 0.9971,
"step": 325
},
{
"epoch": 0.12146714964319025,
"grad_norm": 3.984375,
"learning_rate": 1.9976376604970818e-05,
"loss": 0.965,
"step": 350
},
{
"epoch": 0.13014337461770384,
"grad_norm": 3.875,
"learning_rate": 1.997141805645824e-05,
"loss": 0.941,
"step": 375
},
{
"epoch": 0.13881959959221743,
"grad_norm": 3.625,
"learning_rate": 1.996598820669967e-05,
"loss": 0.9471,
"step": 400
},
{
"epoch": 0.14749582456673102,
"grad_norm": 3.75,
"learning_rate": 1.9960087312338138e-05,
"loss": 0.9273,
"step": 425
},
{
"epoch": 0.1561720495412446,
"grad_norm": 3.265625,
"learning_rate": 1.995371565228071e-05,
"loss": 0.8989,
"step": 450
},
{
"epoch": 0.1648482745157582,
"grad_norm": 3.515625,
"learning_rate": 1.994687352768527e-05,
"loss": 0.8921,
"step": 475
},
{
"epoch": 0.1735244994902718,
"grad_norm": 3.265625,
"learning_rate": 1.9939561261946343e-05,
"loss": 0.8718,
"step": 500
},
{
"epoch": 0.18220072446478539,
"grad_norm": 3.46875,
"learning_rate": 1.9931779200679754e-05,
"loss": 0.8735,
"step": 525
},
{
"epoch": 0.19087694943929895,
"grad_norm": 3.53125,
"learning_rate": 1.992352771170633e-05,
"loss": 0.8747,
"step": 550
},
{
"epoch": 0.19955317441381254,
"grad_norm": 3.734375,
"learning_rate": 1.9914807185034483e-05,
"loss": 0.8315,
"step": 575
},
{
"epoch": 0.20822939938832613,
"grad_norm": 3.390625,
"learning_rate": 1.9905618032841812e-05,
"loss": 0.8365,
"step": 600
},
{
"epoch": 0.20822939938832613,
"eval_loss": 0.9373729228973389,
"eval_runtime": 18.8365,
"eval_samples_per_second": 23.359,
"eval_steps_per_second": 23.359,
"step": 600
},
{
"epoch": 0.21690562436283972,
"grad_norm": 3.5625,
"learning_rate": 1.9895960689455598e-05,
"loss": 0.8469,
"step": 625
},
{
"epoch": 0.2255818493373533,
"grad_norm": 3.390625,
"learning_rate": 1.9885835611332278e-05,
"loss": 0.8306,
"step": 650
},
{
"epoch": 0.2342580743118669,
"grad_norm": 3.171875,
"learning_rate": 1.987524327703587e-05,
"loss": 0.7991,
"step": 675
},
{
"epoch": 0.2429342992863805,
"grad_norm": 3.1875,
"learning_rate": 1.986418418721537e-05,
"loss": 0.8085,
"step": 700
},
{
"epoch": 0.2516105242608941,
"grad_norm": 3.25,
"learning_rate": 1.9852658864581063e-05,
"loss": 0.7983,
"step": 725
},
{
"epoch": 0.2602867492354077,
"grad_norm": 2.984375,
"learning_rate": 1.9840667853879827e-05,
"loss": 0.7847,
"step": 750
},
{
"epoch": 0.26896297420992127,
"grad_norm": 3.234375,
"learning_rate": 1.9828211721869404e-05,
"loss": 0.7744,
"step": 775
},
{
"epoch": 0.27763919918443486,
"grad_norm": 3.171875,
"learning_rate": 1.9815291057291583e-05,
"loss": 0.7846,
"step": 800
},
{
"epoch": 0.28631542415894845,
"grad_norm": 3.21875,
"learning_rate": 1.980190647084438e-05,
"loss": 0.7874,
"step": 825
},
{
"epoch": 0.29499164913346204,
"grad_norm": 3.28125,
"learning_rate": 1.9788058595153202e-05,
"loss": 0.7744,
"step": 850
},
{
"epoch": 0.30366787410797563,
"grad_norm": 3.28125,
"learning_rate": 1.97737480847409e-05,
"loss": 0.7565,
"step": 875
},
{
"epoch": 0.3123440990824892,
"grad_norm": 3.15625,
"learning_rate": 1.9758975615996874e-05,
"loss": 0.7477,
"step": 900
},
{
"epoch": 0.3123440990824892,
"eval_loss": 0.8618763089179993,
"eval_runtime": 39.106,
"eval_samples_per_second": 11.251,
"eval_steps_per_second": 11.251,
"step": 900
},
{
"epoch": 0.3210203240570028,
"grad_norm": 2.9375,
"learning_rate": 1.9743741887145067e-05,
"loss": 0.7589,
"step": 925
},
{
"epoch": 0.3296965490315164,
"grad_norm": 2.96875,
"learning_rate": 1.9728047618210995e-05,
"loss": 0.7397,
"step": 950
},
{
"epoch": 0.33837277400603,
"grad_norm": 3.140625,
"learning_rate": 1.9711893550987696e-05,
"loss": 0.7504,
"step": 975
},
{
"epoch": 0.3470489989805436,
"grad_norm": 3.34375,
"learning_rate": 1.969528044900068e-05,
"loss": 0.7365,
"step": 1000
},
{
"epoch": 0.3557252239550572,
"grad_norm": 2.921875,
"learning_rate": 1.967820909747182e-05,
"loss": 0.7463,
"step": 1025
},
{
"epoch": 0.36440144892957077,
"grad_norm": 3.546875,
"learning_rate": 1.9660680303282273e-05,
"loss": 0.7175,
"step": 1050
},
{
"epoch": 0.3730776739040843,
"grad_norm": 3.515625,
"learning_rate": 1.964269489493431e-05,
"loss": 0.7475,
"step": 1075
},
{
"epoch": 0.3817538988785979,
"grad_norm": 2.953125,
"learning_rate": 1.9624253722512174e-05,
"loss": 0.7255,
"step": 1100
},
{
"epoch": 0.3904301238531115,
"grad_norm": 2.953125,
"learning_rate": 1.9605357657641896e-05,
"loss": 0.7322,
"step": 1125
},
{
"epoch": 0.3991063488276251,
"grad_norm": 3.0,
"learning_rate": 1.9586007593450098e-05,
"loss": 0.7329,
"step": 1150
},
{
"epoch": 0.40778257380213867,
"grad_norm": 2.875,
"learning_rate": 1.9566204444521776e-05,
"loss": 0.7143,
"step": 1175
},
{
"epoch": 0.41645879877665226,
"grad_norm": 3.4375,
"learning_rate": 1.954594914685708e-05,
"loss": 0.702,
"step": 1200
},
{
"epoch": 0.41645879877665226,
"eval_loss": 0.8318689465522766,
"eval_runtime": 18.8241,
"eval_samples_per_second": 23.374,
"eval_steps_per_second": 23.374,
"step": 1200
},
{
"epoch": 0.42513502375116585,
"grad_norm": 3.15625,
"learning_rate": 1.9525242657827063e-05,
"loss": 0.7272,
"step": 1225
},
{
"epoch": 0.43381124872567944,
"grad_norm": 3.375,
"learning_rate": 1.9504085956128437e-05,
"loss": 0.7043,
"step": 1250
},
{
"epoch": 0.44248747370019303,
"grad_norm": 2.9375,
"learning_rate": 1.9482480041737312e-05,
"loss": 0.7123,
"step": 1275
},
{
"epoch": 0.4511636986747066,
"grad_norm": 2.890625,
"learning_rate": 1.946042593586195e-05,
"loss": 0.693,
"step": 1300
},
{
"epoch": 0.4598399236492202,
"grad_norm": 2.9375,
"learning_rate": 1.9437924680894456e-05,
"loss": 0.7004,
"step": 1325
},
{
"epoch": 0.4685161486237338,
"grad_norm": 3.1875,
"learning_rate": 1.941497734036155e-05,
"loss": 0.6827,
"step": 1350
},
{
"epoch": 0.4771923735982474,
"grad_norm": 3.15625,
"learning_rate": 1.939158499887428e-05,
"loss": 0.6949,
"step": 1375
},
{
"epoch": 0.485868598572761,
"grad_norm": 3.328125,
"learning_rate": 1.936774876207676e-05,
"loss": 0.7027,
"step": 1400
},
{
"epoch": 0.4945448235472746,
"grad_norm": 2.953125,
"learning_rate": 1.9343469756593915e-05,
"loss": 0.7069,
"step": 1425
},
{
"epoch": 0.5032210485217882,
"grad_norm": 2.609375,
"learning_rate": 1.9318749129978225e-05,
"loss": 0.6873,
"step": 1450
},
{
"epoch": 0.5118972734963018,
"grad_norm": 2.734375,
"learning_rate": 1.9293588050655492e-05,
"loss": 0.6733,
"step": 1475
},
{
"epoch": 0.5205734984708154,
"grad_norm": 2.828125,
"learning_rate": 1.9267987707869605e-05,
"loss": 0.6779,
"step": 1500
},
{
"epoch": 0.5205734984708154,
"eval_loss": 0.7879548072814941,
"eval_runtime": 18.8786,
"eval_samples_per_second": 23.307,
"eval_steps_per_second": 23.307,
"step": 1500
},
{
"epoch": 0.529249723445329,
"grad_norm": 3.265625,
"learning_rate": 1.924194931162635e-05,
"loss": 0.6733,
"step": 1525
},
{
"epoch": 0.5379259484198425,
"grad_norm": 2.84375,
"learning_rate": 1.9215474092636187e-05,
"loss": 0.6681,
"step": 1550
},
{
"epoch": 0.5466021733943561,
"grad_norm": 2.859375,
"learning_rate": 1.918856330225611e-05,
"loss": 0.6771,
"step": 1575
},
{
"epoch": 0.5552783983688697,
"grad_norm": 3.296875,
"learning_rate": 1.916121821243049e-05,
"loss": 0.6582,
"step": 1600
},
{
"epoch": 0.5639546233433833,
"grad_norm": 3.015625,
"learning_rate": 1.9133440115630953e-05,
"loss": 0.6551,
"step": 1625
},
{
"epoch": 0.5726308483178969,
"grad_norm": 3.125,
"learning_rate": 1.910523032479529e-05,
"loss": 0.6631,
"step": 1650
},
{
"epoch": 0.5813070732924105,
"grad_norm": 3.0,
"learning_rate": 1.9076590173265406e-05,
"loss": 0.6593,
"step": 1675
},
{
"epoch": 0.5899832982669241,
"grad_norm": 2.84375,
"learning_rate": 1.9047521014724303e-05,
"loss": 0.6439,
"step": 1700
},
{
"epoch": 0.5986595232414377,
"grad_norm": 2.953125,
"learning_rate": 1.9018024223132096e-05,
"loss": 0.6538,
"step": 1725
},
{
"epoch": 0.6073357482159513,
"grad_norm": 3.171875,
"learning_rate": 1.8988101192661057e-05,
"loss": 0.6662,
"step": 1750
},
{
"epoch": 0.6160119731904649,
"grad_norm": 2.796875,
"learning_rate": 1.895775333762974e-05,
"loss": 0.6467,
"step": 1775
},
{
"epoch": 0.6246881981649784,
"grad_norm": 2.796875,
"learning_rate": 1.8926982092436117e-05,
"loss": 0.643,
"step": 1800
},
{
"epoch": 0.6246881981649784,
"eval_loss": 0.7786636352539062,
"eval_runtime": 19.0872,
"eval_samples_per_second": 23.052,
"eval_steps_per_second": 23.052,
"step": 1800
},
{
"epoch": 0.633364423139492,
"grad_norm": 2.84375,
"learning_rate": 1.88957889114898e-05,
"loss": 0.6471,
"step": 1825
},
{
"epoch": 0.6420406481140056,
"grad_norm": 2.53125,
"learning_rate": 1.8864175269143275e-05,
"loss": 0.6413,
"step": 1850
},
{
"epoch": 0.6507168730885192,
"grad_norm": 2.984375,
"learning_rate": 1.8832142659622236e-05,
"loss": 0.6424,
"step": 1875
},
{
"epoch": 0.6593930980630328,
"grad_norm": 3.203125,
"learning_rate": 1.8799692596954947e-05,
"loss": 0.6405,
"step": 1900
},
{
"epoch": 0.6680693230375464,
"grad_norm": 2.6875,
"learning_rate": 1.8766826614900687e-05,
"loss": 0.6307,
"step": 1925
},
{
"epoch": 0.67674554801206,
"grad_norm": 2.828125,
"learning_rate": 1.8733546266877254e-05,
"loss": 0.6151,
"step": 1950
},
{
"epoch": 0.6854217729865736,
"grad_norm": 2.921875,
"learning_rate": 1.8699853125887543e-05,
"loss": 0.6442,
"step": 1975
},
{
"epoch": 0.6940979979610872,
"grad_norm": 2.671875,
"learning_rate": 1.8665748784445206e-05,
"loss": 0.6104,
"step": 2000
},
{
"epoch": 0.7027742229356008,
"grad_norm": 2.8125,
"learning_rate": 1.8631234854499365e-05,
"loss": 0.6213,
"step": 2025
},
{
"epoch": 0.7114504479101144,
"grad_norm": 2.84375,
"learning_rate": 1.8596312967358436e-05,
"loss": 0.6198,
"step": 2050
},
{
"epoch": 0.720126672884628,
"grad_norm": 3.140625,
"learning_rate": 1.856098477361302e-05,
"loss": 0.6263,
"step": 2075
},
{
"epoch": 0.7288028978591415,
"grad_norm": 2.6875,
"learning_rate": 1.8525251943057884e-05,
"loss": 0.6201,
"step": 2100
},
{
"epoch": 0.7288028978591415,
"eval_loss": 0.7636004090309143,
"eval_runtime": 18.8162,
"eval_samples_per_second": 23.384,
"eval_steps_per_second": 23.384,
"step": 2100
},
{
"epoch": 0.7374791228336551,
"grad_norm": 2.953125,
"learning_rate": 1.8489116164613053e-05,
"loss": 0.6182,
"step": 2125
},
{
"epoch": 0.7461553478081686,
"grad_norm": 2.859375,
"learning_rate": 1.845257914624396e-05,
"loss": 0.6252,
"step": 2150
},
{
"epoch": 0.7548315727826822,
"grad_norm": 2.796875,
"learning_rate": 1.841564261488074e-05,
"loss": 0.6067,
"step": 2175
},
{
"epoch": 0.7635077977571958,
"grad_norm": 2.8125,
"learning_rate": 1.8378308316336585e-05,
"loss": 0.6172,
"step": 2200
},
{
"epoch": 0.7721840227317094,
"grad_norm": 3.015625,
"learning_rate": 1.834057801522525e-05,
"loss": 0.6064,
"step": 2225
},
{
"epoch": 0.780860247706223,
"grad_norm": 2.703125,
"learning_rate": 1.8302453494877635e-05,
"loss": 0.6131,
"step": 2250
},
{
"epoch": 0.7895364726807366,
"grad_norm": 2.6875,
"learning_rate": 1.8263936557257496e-05,
"loss": 0.6197,
"step": 2275
},
{
"epoch": 0.7982126976552502,
"grad_norm": 2.796875,
"learning_rate": 1.8225029022876275e-05,
"loss": 0.6128,
"step": 2300
},
{
"epoch": 0.8068889226297637,
"grad_norm": 2.84375,
"learning_rate": 1.818573273070706e-05,
"loss": 0.5884,
"step": 2325
},
{
"epoch": 0.8155651476042773,
"grad_norm": 3.015625,
"learning_rate": 1.8146049538097662e-05,
"loss": 0.6053,
"step": 2350
},
{
"epoch": 0.8242413725787909,
"grad_norm": 2.84375,
"learning_rate": 1.8105981320682815e-05,
"loss": 0.6103,
"step": 2375
},
{
"epoch": 0.8329175975533045,
"grad_norm": 2.546875,
"learning_rate": 1.8065529972295545e-05,
"loss": 0.6053,
"step": 2400
},
{
"epoch": 0.8329175975533045,
"eval_loss": 0.738046407699585,
"eval_runtime": 18.7861,
"eval_samples_per_second": 23.422,
"eval_steps_per_second": 23.422,
"step": 2400
},
{
"epoch": 0.8415938225278181,
"grad_norm": 3.078125,
"learning_rate": 1.802469740487764e-05,
"loss": 0.5852,
"step": 2425
},
{
"epoch": 0.8502700475023317,
"grad_norm": 2.921875,
"learning_rate": 1.7983485548389293e-05,
"loss": 0.5995,
"step": 2450
},
{
"epoch": 0.8589462724768453,
"grad_norm": 2.796875,
"learning_rate": 1.794189635071788e-05,
"loss": 0.5924,
"step": 2475
},
{
"epoch": 0.8676224974513589,
"grad_norm": 2.609375,
"learning_rate": 1.789993177758588e-05,
"loss": 0.5757,
"step": 2500
},
{
"epoch": 0.8762987224258725,
"grad_norm": 2.734375,
"learning_rate": 1.7857593812457985e-05,
"loss": 0.5869,
"step": 2525
},
{
"epoch": 0.8849749474003861,
"grad_norm": 2.875,
"learning_rate": 1.7814884456447337e-05,
"loss": 0.6001,
"step": 2550
},
{
"epoch": 0.8936511723748997,
"grad_norm": 3.0625,
"learning_rate": 1.7771805728220942e-05,
"loss": 0.5996,
"step": 2575
},
{
"epoch": 0.9023273973494133,
"grad_norm": 2.890625,
"learning_rate": 1.772835966390428e-05,
"loss": 0.578,
"step": 2600
},
{
"epoch": 0.9110036223239268,
"grad_norm": 2.734375,
"learning_rate": 1.7684548316985043e-05,
"loss": 0.5959,
"step": 2625
},
{
"epoch": 0.9196798472984404,
"grad_norm": 2.84375,
"learning_rate": 1.7640373758216075e-05,
"loss": 0.5728,
"step": 2650
},
{
"epoch": 0.928356072272954,
"grad_norm": 2.78125,
"learning_rate": 1.7595838075517523e-05,
"loss": 0.5762,
"step": 2675
},
{
"epoch": 0.9370322972474676,
"grad_norm": 2.609375,
"learning_rate": 1.755094337387813e-05,
"loss": 0.5801,
"step": 2700
},
{
"epoch": 0.9370322972474676,
"eval_loss": 0.7330707907676697,
"eval_runtime": 18.9063,
"eval_samples_per_second": 23.273,
"eval_steps_per_second": 23.273,
"step": 2700
},
{
"epoch": 0.9457085222219812,
"grad_norm": 3.265625,
"learning_rate": 1.7505691775255744e-05,
"loss": 0.5517,
"step": 2725
},
{
"epoch": 0.9543847471964948,
"grad_norm": 2.765625,
"learning_rate": 1.7460085418477025e-05,
"loss": 0.5622,
"step": 2750
},
{
"epoch": 0.9630609721710084,
"grad_norm": 2.609375,
"learning_rate": 1.7414126459136365e-05,
"loss": 0.5664,
"step": 2775
},
{
"epoch": 0.971737197145522,
"grad_norm": 2.84375,
"learning_rate": 1.736781706949398e-05,
"loss": 0.5676,
"step": 2800
},
{
"epoch": 0.9804134221200356,
"grad_norm": 2.875,
"learning_rate": 1.732115943837326e-05,
"loss": 0.5925,
"step": 2825
},
{
"epoch": 0.9890896470945492,
"grad_norm": 3.078125,
"learning_rate": 1.7274155771057302e-05,
"loss": 0.5673,
"step": 2850
},
{
"epoch": 0.9977658720690628,
"grad_norm": 2.84375,
"learning_rate": 1.7226808289184673e-05,
"loss": 0.5745,
"step": 2875
},
{
"epoch": 1.0064420970435763,
"grad_norm": 2.578125,
"learning_rate": 1.717911923064442e-05,
"loss": 0.5659,
"step": 2900
},
{
"epoch": 1.0045441728304014,
"grad_norm": 2.640625,
"learning_rate": 1.713109084947028e-05,
"loss": 0.4966,
"step": 2925
},
{
"epoch": 1.013220397804915,
"grad_norm": 2.546875,
"learning_rate": 1.7082725415734145e-05,
"loss": 0.4426,
"step": 2950
},
{
"epoch": 1.0218966227794286,
"grad_norm": 2.546875,
"learning_rate": 1.7034025215438776e-05,
"loss": 0.4382,
"step": 2975
},
{
"epoch": 1.0305728477539422,
"grad_norm": 2.640625,
"learning_rate": 1.6984992550409747e-05,
"loss": 0.4414,
"step": 3000
},
{
"epoch": 1.0305728477539422,
"eval_loss": 0.7384564280509949,
"eval_runtime": 18.8395,
"eval_samples_per_second": 23.355,
"eval_steps_per_second": 23.355,
"step": 3000
},
{
"epoch": 1.0392490727284558,
"grad_norm": 2.890625,
"learning_rate": 1.6935629738186646e-05,
"loss": 0.4454,
"step": 3025
},
{
"epoch": 1.0479252977029694,
"grad_norm": 2.84375,
"learning_rate": 1.6885939111913544e-05,
"loss": 0.4334,
"step": 3050
},
{
"epoch": 1.056601522677483,
"grad_norm": 2.8125,
"learning_rate": 1.6835923020228714e-05,
"loss": 0.4293,
"step": 3075
},
{
"epoch": 1.0652777476519966,
"grad_norm": 2.5,
"learning_rate": 1.678558382715362e-05,
"loss": 0.4502,
"step": 3100
},
{
"epoch": 1.0739539726265102,
"grad_norm": 2.90625,
"learning_rate": 1.6734923911981188e-05,
"loss": 0.437,
"step": 3125
},
{
"epoch": 1.0826301976010237,
"grad_norm": 2.78125,
"learning_rate": 1.668394566916334e-05,
"loss": 0.4442,
"step": 3150
},
{
"epoch": 1.0913064225755373,
"grad_norm": 2.546875,
"learning_rate": 1.6632651508197827e-05,
"loss": 0.4448,
"step": 3175
},
{
"epoch": 1.099982647550051,
"grad_norm": 2.703125,
"learning_rate": 1.6581043853514335e-05,
"loss": 0.4358,
"step": 3200
},
{
"epoch": 1.1086588725245645,
"grad_norm": 2.765625,
"learning_rate": 1.6529125144359902e-05,
"loss": 0.4561,
"step": 3225
},
{
"epoch": 1.1173350974990781,
"grad_norm": 2.8125,
"learning_rate": 1.647689783468362e-05,
"loss": 0.4294,
"step": 3250
},
{
"epoch": 1.1260113224735917,
"grad_norm": 2.9375,
"learning_rate": 1.642436439302066e-05,
"loss": 0.4316,
"step": 3275
},
{
"epoch": 1.1346875474481053,
"grad_norm": 2.71875,
"learning_rate": 1.637152730237558e-05,
"loss": 0.4455,
"step": 3300
},
{
"epoch": 1.1346875474481053,
"eval_loss": 0.7258099913597107,
"eval_runtime": 18.9376,
"eval_samples_per_second": 23.234,
"eval_steps_per_second": 23.234,
"step": 3300
},
{
"epoch": 1.1433637724226189,
"grad_norm": 2.984375,
"learning_rate": 1.631838906010498e-05,
"loss": 0.4332,
"step": 3325
},
{
"epoch": 1.1520399973971325,
"grad_norm": 2.828125,
"learning_rate": 1.6264952177799446e-05,
"loss": 0.4303,
"step": 3350
},
{
"epoch": 1.160716222371646,
"grad_norm": 2.8125,
"learning_rate": 1.6211219181164864e-05,
"loss": 0.4498,
"step": 3375
},
{
"epoch": 1.1693924473461597,
"grad_norm": 2.71875,
"learning_rate": 1.6157192609903017e-05,
"loss": 0.445,
"step": 3400
},
{
"epoch": 1.1780686723206732,
"grad_norm": 3.015625,
"learning_rate": 1.6102875017591566e-05,
"loss": 0.4471,
"step": 3425
},
{
"epoch": 1.1867448972951868,
"grad_norm": 3.109375,
"learning_rate": 1.6048268971563337e-05,
"loss": 0.4449,
"step": 3450
},
{
"epoch": 1.1954211222697004,
"grad_norm": 2.53125,
"learning_rate": 1.5993377052784988e-05,
"loss": 0.4333,
"step": 3475
},
{
"epoch": 1.204097347244214,
"grad_norm": 3.015625,
"learning_rate": 1.5938201855735017e-05,
"loss": 0.4307,
"step": 3500
},
{
"epoch": 1.2127735722187276,
"grad_norm": 2.6875,
"learning_rate": 1.588274598828113e-05,
"loss": 0.4251,
"step": 3525
},
{
"epoch": 1.2214497971932412,
"grad_norm": 3.0625,
"learning_rate": 1.582701207155697e-05,
"loss": 0.4227,
"step": 3550
},
{
"epoch": 1.2301260221677548,
"grad_norm": 2.875,
"learning_rate": 1.577100273983826e-05,
"loss": 0.4401,
"step": 3575
},
{
"epoch": 1.2388022471422684,
"grad_norm": 2.9375,
"learning_rate": 1.5714720640418252e-05,
"loss": 0.4333,
"step": 3600
},
{
"epoch": 1.2388022471422684,
"eval_loss": 0.713193416595459,
"eval_runtime": 18.8026,
"eval_samples_per_second": 23.401,
"eval_steps_per_second": 23.401,
"step": 3600
},
{
"epoch": 1.247478472116782,
"grad_norm": 2.75,
"learning_rate": 1.5658168433482637e-05,
"loss": 0.432,
"step": 3625
},
{
"epoch": 1.2561546970912956,
"grad_norm": 2.84375,
"learning_rate": 1.560134879198379e-05,
"loss": 0.429,
"step": 3650
},
{
"epoch": 1.2648309220658092,
"grad_norm": 3.015625,
"learning_rate": 1.554426440151444e-05,
"loss": 0.4378,
"step": 3675
},
{
"epoch": 1.2735071470403228,
"grad_norm": 2.703125,
"learning_rate": 1.5486917960180742e-05,
"loss": 0.4278,
"step": 3700
},
{
"epoch": 1.2821833720148363,
"grad_norm": 2.625,
"learning_rate": 1.542931217847472e-05,
"loss": 0.429,
"step": 3725
},
{
"epoch": 1.29085959698935,
"grad_norm": 2.921875,
"learning_rate": 1.5371449779146205e-05,
"loss": 0.4289,
"step": 3750
},
{
"epoch": 1.2995358219638635,
"grad_norm": 2.671875,
"learning_rate": 1.5313333497074094e-05,
"loss": 0.4271,
"step": 3775
},
{
"epoch": 1.3082120469383771,
"grad_norm": 2.546875,
"learning_rate": 1.5254966079137118e-05,
"loss": 0.4239,
"step": 3800
},
{
"epoch": 1.3168882719128907,
"grad_norm": 2.953125,
"learning_rate": 1.5196350284083999e-05,
"loss": 0.4291,
"step": 3825
},
{
"epoch": 1.3255644968874043,
"grad_norm": 2.796875,
"learning_rate": 1.513748888240305e-05,
"loss": 0.429,
"step": 3850
},
{
"epoch": 1.3342407218619179,
"grad_norm": 3.125,
"learning_rate": 1.507838465619125e-05,
"loss": 0.4232,
"step": 3875
},
{
"epoch": 1.3429169468364315,
"grad_norm": 2.578125,
"learning_rate": 1.5019040399022711e-05,
"loss": 0.4237,
"step": 3900
},
{
"epoch": 1.3429169468364315,
"eval_loss": 0.7250744700431824,
"eval_runtime": 18.82,
"eval_samples_per_second": 23.379,
"eval_steps_per_second": 23.379,
"step": 3900
},
{
"epoch": 1.351593171810945,
"grad_norm": 3.296875,
"learning_rate": 1.4959458915816681e-05,
"loss": 0.4297,
"step": 3925
},
{
"epoch": 1.3602693967854587,
"grad_norm": 2.875,
"learning_rate": 1.489964302270493e-05,
"loss": 0.4331,
"step": 3950
},
{
"epoch": 1.3689456217599723,
"grad_norm": 2.734375,
"learning_rate": 1.483959554689868e-05,
"loss": 0.43,
"step": 3975
},
{
"epoch": 1.3776218467344858,
"grad_norm": 2.796875,
"learning_rate": 1.4779319326554953e-05,
"loss": 0.4165,
"step": 4000
},
{
"epoch": 1.3862980717089994,
"grad_norm": 2.84375,
"learning_rate": 1.4718817210642427e-05,
"loss": 0.4325,
"step": 4025
},
{
"epoch": 1.394974296683513,
"grad_norm": 2.921875,
"learning_rate": 1.4658092058806783e-05,
"loss": 0.4225,
"step": 4050
},
{
"epoch": 1.4036505216580266,
"grad_norm": 3.125,
"learning_rate": 1.4597146741235554e-05,
"loss": 0.4137,
"step": 4075
},
{
"epoch": 1.4123267466325402,
"grad_norm": 2.75,
"learning_rate": 1.4535984138522442e-05,
"loss": 0.4075,
"step": 4100
},
{
"epoch": 1.4210029716070538,
"grad_norm": 2.796875,
"learning_rate": 1.447460714153119e-05,
"loss": 0.4228,
"step": 4125
},
{
"epoch": 1.4296791965815674,
"grad_norm": 2.90625,
"learning_rate": 1.4413018651258922e-05,
"loss": 0.4215,
"step": 4150
},
{
"epoch": 1.438355421556081,
"grad_norm": 2.984375,
"learning_rate": 1.4351221578699045e-05,
"loss": 0.4203,
"step": 4175
},
{
"epoch": 1.4470316465305946,
"grad_norm": 2.8125,
"learning_rate": 1.4289218844703654e-05,
"loss": 0.4068,
"step": 4200
},
{
"epoch": 1.4470316465305946,
"eval_loss": 0.7109268307685852,
"eval_runtime": 19.0434,
"eval_samples_per_second": 23.105,
"eval_steps_per_second": 23.105,
"step": 4200
},
{
"epoch": 1.4557078715051082,
"grad_norm": 3.171875,
"learning_rate": 1.4227013379845471e-05,
"loss": 0.4169,
"step": 4225
},
{
"epoch": 1.4643840964796218,
"grad_norm": 2.75,
"learning_rate": 1.4164608124279337e-05,
"loss": 0.407,
"step": 4250
},
{
"epoch": 1.4730603214541353,
"grad_norm": 2.671875,
"learning_rate": 1.4102006027603255e-05,
"loss": 0.4349,
"step": 4275
},
{
"epoch": 1.481736546428649,
"grad_norm": 3.125,
"learning_rate": 1.403921004871895e-05,
"loss": 0.4077,
"step": 4300
},
{
"epoch": 1.4904127714031625,
"grad_norm": 3.140625,
"learning_rate": 1.3976223155692047e-05,
"loss": 0.4234,
"step": 4325
},
{
"epoch": 1.4990889963776761,
"grad_norm": 2.9375,
"learning_rate": 1.391304832561175e-05,
"loss": 0.4177,
"step": 4350
},
{
"epoch": 1.5077652213521897,
"grad_norm": 2.90625,
"learning_rate": 1.3849688544450176e-05,
"loss": 0.4027,
"step": 4375
},
{
"epoch": 1.5164414463267033,
"grad_norm": 2.765625,
"learning_rate": 1.3786146806921166e-05,
"loss": 0.4125,
"step": 4400
},
{
"epoch": 1.525117671301217,
"grad_norm": 2.9375,
"learning_rate": 1.3722426116338792e-05,
"loss": 0.4019,
"step": 4425
},
{
"epoch": 1.5337938962757305,
"grad_norm": 2.671875,
"learning_rate": 1.3658529484475369e-05,
"loss": 0.4175,
"step": 4450
},
{
"epoch": 1.5424701212502439,
"grad_norm": 2.796875,
"learning_rate": 1.3594459931419112e-05,
"loss": 0.4136,
"step": 4475
},
{
"epoch": 1.5511463462247574,
"grad_norm": 2.71875,
"learning_rate": 1.3530220485431405e-05,
"loss": 0.3997,
"step": 4500
},
{
"epoch": 1.5511463462247574,
"eval_loss": 0.706421434879303,
"eval_runtime": 19.0141,
"eval_samples_per_second": 23.141,
"eval_steps_per_second": 23.141,
"step": 4500
},
{
"epoch": 1.559822571199271,
"grad_norm": 2.875,
"learning_rate": 1.3465814182803653e-05,
"loss": 0.422,
"step": 4525
},
{
"epoch": 1.5684987961737846,
"grad_norm": 2.875,
"learning_rate": 1.340124406771377e-05,
"loss": 0.4171,
"step": 4550
},
{
"epoch": 1.5771750211482982,
"grad_norm": 2.984375,
"learning_rate": 1.3336513192082316e-05,
"loss": 0.4085,
"step": 4575
},
{
"epoch": 1.5858512461228118,
"grad_norm": 2.90625,
"learning_rate": 1.3271624615428218e-05,
"loss": 0.4088,
"step": 4600
},
{
"epoch": 1.5945274710973254,
"grad_norm": 2.6875,
"learning_rate": 1.3206581404724185e-05,
"loss": 0.3976,
"step": 4625
},
{
"epoch": 1.603203696071839,
"grad_norm": 3.34375,
"learning_rate": 1.3141386634251736e-05,
"loss": 0.404,
"step": 4650
},
{
"epoch": 1.6118799210463526,
"grad_norm": 2.78125,
"learning_rate": 1.3076043385455894e-05,
"loss": 0.4128,
"step": 4675
},
{
"epoch": 1.6205561460208662,
"grad_norm": 2.890625,
"learning_rate": 1.3010554746799544e-05,
"loss": 0.3959,
"step": 4700
},
{
"epoch": 1.6292323709953798,
"grad_norm": 2.8125,
"learning_rate": 1.2944923813617458e-05,
"loss": 0.3978,
"step": 4725
},
{
"epoch": 1.6379085959698934,
"grad_norm": 2.859375,
"learning_rate": 1.2879153687969984e-05,
"loss": 0.4009,
"step": 4750
},
{
"epoch": 1.646584820944407,
"grad_norm": 3.34375,
"learning_rate": 1.2813247478496428e-05,
"loss": 0.3974,
"step": 4775
},
{
"epoch": 1.6552610459189205,
"grad_norm": 3.03125,
"learning_rate": 1.274720830026814e-05,
"loss": 0.3967,
"step": 4800
},
{
"epoch": 1.6552610459189205,
"eval_loss": 0.7064741253852844,
"eval_runtime": 19.1066,
"eval_samples_per_second": 23.029,
"eval_steps_per_second": 23.029,
"step": 4800
},
{
"epoch": 1.6639372708934341,
"grad_norm": 2.65625,
"learning_rate": 1.2681039274641261e-05,
"loss": 0.4103,
"step": 4825
},
{
"epoch": 1.6726134958679477,
"grad_norm": 2.890625,
"learning_rate": 1.261474352910919e-05,
"loss": 0.4044,
"step": 4850
},
{
"epoch": 1.6812897208424613,
"grad_norm": 2.828125,
"learning_rate": 1.2548324197154788e-05,
"loss": 0.3968,
"step": 4875
},
{
"epoch": 1.689965945816975,
"grad_norm": 2.75,
"learning_rate": 1.248178441810224e-05,
"loss": 0.3955,
"step": 4900
},
{
"epoch": 1.6986421707914885,
"grad_norm": 2.6875,
"learning_rate": 1.2415127336968691e-05,
"loss": 0.3903,
"step": 4925
},
{
"epoch": 1.707318395766002,
"grad_norm": 3.15625,
"learning_rate": 1.23483561043156e-05,
"loss": 0.3897,
"step": 4950
},
{
"epoch": 1.7159946207405157,
"grad_norm": 2.796875,
"learning_rate": 1.2281473876099822e-05,
"loss": 0.3981,
"step": 4975
},
{
"epoch": 1.7246708457150293,
"grad_norm": 2.953125,
"learning_rate": 1.2214483813524429e-05,
"loss": 0.4172,
"step": 5000
},
{
"epoch": 1.7333470706895429,
"grad_norm": 2.875,
"learning_rate": 1.2147389082889328e-05,
"loss": 0.398,
"step": 5025
},
{
"epoch": 1.7420232956640564,
"grad_norm": 2.921875,
"learning_rate": 1.2080192855441572e-05,
"loss": 0.3901,
"step": 5050
},
{
"epoch": 1.75069952063857,
"grad_norm": 2.84375,
"learning_rate": 1.2012898307225482e-05,
"loss": 0.3865,
"step": 5075
},
{
"epoch": 1.7593757456130836,
"grad_norm": 3.203125,
"learning_rate": 1.1945508618932537e-05,
"loss": 0.3904,
"step": 5100
},
{
"epoch": 1.7593757456130836,
"eval_loss": 0.707400918006897,
"eval_runtime": 18.7714,
"eval_samples_per_second": 23.44,
"eval_steps_per_second": 23.44,
"step": 5100
},
{
"epoch": 1.7680519705875972,
"grad_norm": 2.5625,
"learning_rate": 1.1878026975751033e-05,
"loss": 0.3987,
"step": 5125
},
{
"epoch": 1.7767281955621108,
"grad_norm": 3.109375,
"learning_rate": 1.1810456567215525e-05,
"loss": 0.3977,
"step": 5150
},
{
"epoch": 1.7854044205366244,
"grad_norm": 2.984375,
"learning_rate": 1.1742800587056092e-05,
"loss": 0.3913,
"step": 5175
},
{
"epoch": 1.794080645511138,
"grad_norm": 2.984375,
"learning_rate": 1.1675062233047365e-05,
"loss": 0.3835,
"step": 5200
},
{
"epoch": 1.8027568704856516,
"grad_norm": 2.96875,
"learning_rate": 1.1607244706857404e-05,
"loss": 0.3856,
"step": 5225
},
{
"epoch": 1.8114330954601652,
"grad_norm": 2.65625,
"learning_rate": 1.1539351213896352e-05,
"loss": 0.3835,
"step": 5250
},
{
"epoch": 1.8201093204346788,
"grad_norm": 2.8125,
"learning_rate": 1.147138496316494e-05,
"loss": 0.3901,
"step": 5275
},
{
"epoch": 1.8287855454091924,
"grad_norm": 3.0,
"learning_rate": 1.1403349167102806e-05,
"loss": 0.3953,
"step": 5300
},
{
"epoch": 1.837461770383706,
"grad_norm": 3.234375,
"learning_rate": 1.1335247041436674e-05,
"loss": 0.3911,
"step": 5325
},
{
"epoch": 1.8461379953582195,
"grad_norm": 2.875,
"learning_rate": 1.126708180502834e-05,
"loss": 0.3765,
"step": 5350
},
{
"epoch": 1.8548142203327331,
"grad_norm": 2.9375,
"learning_rate": 1.1198856679722548e-05,
"loss": 0.3862,
"step": 5375
},
{
"epoch": 1.8634904453072467,
"grad_norm": 2.75,
"learning_rate": 1.1130574890194706e-05,
"loss": 0.3838,
"step": 5400
},
{
"epoch": 1.8634904453072467,
"eval_loss": 0.7026991248130798,
"eval_runtime": 18.7531,
"eval_samples_per_second": 23.463,
"eval_steps_per_second": 23.463,
"step": 5400
},
{
"epoch": 1.8721666702817603,
"grad_norm": 2.828125,
"learning_rate": 1.1062239663798466e-05,
"loss": 0.3843,
"step": 5425
},
{
"epoch": 1.880842895256274,
"grad_norm": 2.78125,
"learning_rate": 1.0993854230413183e-05,
"loss": 0.3971,
"step": 5450
},
{
"epoch": 1.8895191202307875,
"grad_norm": 3.0625,
"learning_rate": 1.092542182229126e-05,
"loss": 0.378,
"step": 5475
},
{
"epoch": 1.898195345205301,
"grad_norm": 2.953125,
"learning_rate": 1.085694567390537e-05,
"loss": 0.3764,
"step": 5500
},
{
"epoch": 1.9068715701798147,
"grad_norm": 2.96875,
"learning_rate": 1.0788429021795582e-05,
"loss": 0.3705,
"step": 5525
},
{
"epoch": 1.9155477951543283,
"grad_norm": 2.859375,
"learning_rate": 1.0719875104416373e-05,
"loss": 0.3723,
"step": 5550
},
{
"epoch": 1.9242240201288419,
"grad_norm": 2.875,
"learning_rate": 1.0651287161983583e-05,
"loss": 0.3778,
"step": 5575
},
{
"epoch": 1.9329002451033555,
"grad_norm": 2.796875,
"learning_rate": 1.0582668436321244e-05,
"loss": 0.3773,
"step": 5600
},
{
"epoch": 1.941576470077869,
"grad_norm": 3.140625,
"learning_rate": 1.0514022170708374e-05,
"loss": 0.3662,
"step": 5625
},
{
"epoch": 1.9502526950523826,
"grad_norm": 3.1875,
"learning_rate": 1.044535160972566e-05,
"loss": 0.3777,
"step": 5650
},
{
"epoch": 1.9589289200268962,
"grad_norm": 3.078125,
"learning_rate": 1.0376659999102125e-05,
"loss": 0.3775,
"step": 5675
},
{
"epoch": 1.9676051450014098,
"grad_norm": 2.6875,
"learning_rate": 1.0307950585561705e-05,
"loss": 0.3714,
"step": 5700
},
{
"epoch": 1.9676051450014098,
"eval_loss": 0.7053300142288208,
"eval_runtime": 19.1384,
"eval_samples_per_second": 22.99,
"eval_steps_per_second": 22.99,
"step": 5700
},
{
"epoch": 1.9762813699759234,
"grad_norm": 2.875,
"learning_rate": 1.0239226616669792e-05,
"loss": 0.375,
"step": 5725
},
{
"epoch": 1.984957594950437,
"grad_norm": 2.765625,
"learning_rate": 1.0170491340679744e-05,
"loss": 0.3704,
"step": 5750
}
],
"logging_steps": 25,
"max_steps": 11524,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1441,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.495938899681739e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}