|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9850746268656714, |
|
"eval_steps": 500, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03980099502487562, |
|
"grad_norm": 0.49672797322273254, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.634, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.07960199004975124, |
|
"grad_norm": 0.4738655984401703, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.6666, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 0.4305928945541382, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.6673, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.15920398009950248, |
|
"grad_norm": 0.2917488217353821, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5859, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.19900497512437812, |
|
"grad_norm": 0.3740411102771759, |
|
"learning_rate": 6.25e-05, |
|
"loss": 1.641, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 0.3605235815048218, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.5957, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.27860696517412936, |
|
"grad_norm": 0.2877049744129181, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.5118, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.31840796019900497, |
|
"grad_norm": 0.2710678279399872, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5322, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 0.26977717876434326, |
|
"learning_rate": 9.994504457428558e-05, |
|
"loss": 1.5521, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.39800995024875624, |
|
"grad_norm": 0.2279333472251892, |
|
"learning_rate": 9.978029910109491e-05, |
|
"loss": 1.5501, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.43781094527363185, |
|
"grad_norm": 0.22376185655593872, |
|
"learning_rate": 9.950612572673255e-05, |
|
"loss": 1.5476, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 0.22192299365997314, |
|
"learning_rate": 9.91231271437788e-05, |
|
"loss": 1.5191, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.5174129353233831, |
|
"grad_norm": 0.20543566346168518, |
|
"learning_rate": 9.863214526624065e-05, |
|
"loss": 1.5528, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.5572139303482587, |
|
"grad_norm": 0.20152153074741364, |
|
"learning_rate": 9.8034259378842e-05, |
|
"loss": 1.4816, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.204048290848732, |
|
"learning_rate": 9.733078376452171e-05, |
|
"loss": 1.5519, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.6368159203980099, |
|
"grad_norm": 0.19486752152442932, |
|
"learning_rate": 9.652326481535435e-05, |
|
"loss": 1.5183, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.6766169154228856, |
|
"grad_norm": 0.1839868277311325, |
|
"learning_rate": 9.561347763324484e-05, |
|
"loss": 1.5272, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 0.1841403841972351, |
|
"learning_rate": 9.460342212786932e-05, |
|
"loss": 1.523, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.7562189054726368, |
|
"grad_norm": 0.17461948096752167, |
|
"learning_rate": 9.349531862043952e-05, |
|
"loss": 1.5316, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.7960199004975125, |
|
"grad_norm": 0.1808539479970932, |
|
"learning_rate": 9.229160296295488e-05, |
|
"loss": 1.4991, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 0.1901710480451584, |
|
"learning_rate": 9.099492118367123e-05, |
|
"loss": 1.5086, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.8756218905472637, |
|
"grad_norm": 0.183896005153656, |
|
"learning_rate": 8.960812367055646e-05, |
|
"loss": 1.5435, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.9154228855721394, |
|
"grad_norm": 0.17992867529392242, |
|
"learning_rate": 8.81342589055191e-05, |
|
"loss": 1.5201, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 0.17739859223365784, |
|
"learning_rate": 8.657656676318346e-05, |
|
"loss": 1.5059, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.9950248756218906, |
|
"grad_norm": 0.17343272268772125, |
|
"learning_rate": 8.493847138894209e-05, |
|
"loss": 1.5023, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.0348258706467661, |
|
"grad_norm": 0.181735098361969, |
|
"learning_rate": 8.322357367194109e-05, |
|
"loss": 1.4237, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.0746268656716418, |
|
"grad_norm": 0.1831899732351303, |
|
"learning_rate": 8.143564332954425e-05, |
|
"loss": 1.4127, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.1144278606965174, |
|
"grad_norm": 0.17571744322776794, |
|
"learning_rate": 7.957861062067614e-05, |
|
"loss": 1.415, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.154228855721393, |
|
"grad_norm": 0.17378666996955872, |
|
"learning_rate": 7.765655770625997e-05, |
|
"loss": 1.3883, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.1940298507462686, |
|
"grad_norm": 0.18887346982955933, |
|
"learning_rate": 7.56737096757421e-05, |
|
"loss": 1.3688, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.2338308457711442, |
|
"grad_norm": 0.1842719465494156, |
|
"learning_rate": 7.363442525942826e-05, |
|
"loss": 1.3973, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.2736318407960199, |
|
"grad_norm": 0.180495485663414, |
|
"learning_rate": 7.154318724704853e-05, |
|
"loss": 1.4196, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.3134328358208955, |
|
"grad_norm": 0.18591800332069397, |
|
"learning_rate": 6.940459263361249e-05, |
|
"loss": 1.3993, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.3532338308457712, |
|
"grad_norm": 0.1974848359823227, |
|
"learning_rate": 6.722334251421665e-05, |
|
"loss": 1.4004, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.3930348258706466, |
|
"grad_norm": 0.18464985489845276, |
|
"learning_rate": 6.500423175001705e-05, |
|
"loss": 1.4223, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.4328358208955223, |
|
"grad_norm": 0.19280724227428436, |
|
"learning_rate": 6.275213842808383e-05, |
|
"loss": 1.4227, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.472636815920398, |
|
"grad_norm": 0.2069428563117981, |
|
"learning_rate": 6.0472013138307235e-05, |
|
"loss": 1.4064, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.5124378109452736, |
|
"grad_norm": 0.19798892736434937, |
|
"learning_rate": 5.816886809092651e-05, |
|
"loss": 1.4589, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.5522388059701493, |
|
"grad_norm": 0.19987910985946655, |
|
"learning_rate": 5.584776609860414e-05, |
|
"loss": 1.3616, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.5920398009950247, |
|
"grad_norm": 0.19689033925533295, |
|
"learning_rate": 5.351380944726465e-05, |
|
"loss": 1.402, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.6318407960199006, |
|
"grad_norm": 0.20285791158676147, |
|
"learning_rate": 5.117212868016303e-05, |
|
"loss": 1.4233, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.671641791044776, |
|
"grad_norm": 0.20456896722316742, |
|
"learning_rate": 4.882787131983698e-05, |
|
"loss": 1.3909, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.7114427860696517, |
|
"grad_norm": 0.1989758014678955, |
|
"learning_rate": 4.648619055273537e-05, |
|
"loss": 1.4099, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.7512437810945274, |
|
"grad_norm": 0.20214244723320007, |
|
"learning_rate": 4.415223390139588e-05, |
|
"loss": 1.4281, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.7910447761194028, |
|
"grad_norm": 0.20304188132286072, |
|
"learning_rate": 4.183113190907349e-05, |
|
"loss": 1.3892, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.8308457711442787, |
|
"grad_norm": 0.20430393517017365, |
|
"learning_rate": 3.952798686169279e-05, |
|
"loss": 1.3681, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.8706467661691542, |
|
"grad_norm": 0.216043621301651, |
|
"learning_rate": 3.7247861571916185e-05, |
|
"loss": 1.4124, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.9104477611940298, |
|
"grad_norm": 0.20856983959674835, |
|
"learning_rate": 3.499576824998298e-05, |
|
"loss": 1.3772, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.9502487562189055, |
|
"grad_norm": 0.21441003680229187, |
|
"learning_rate": 3.277665748578336e-05, |
|
"loss": 1.3912, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.9900497512437811, |
|
"grad_norm": 0.2081148624420166, |
|
"learning_rate": 3.0595407366387504e-05, |
|
"loss": 1.4066, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.029850746268657, |
|
"grad_norm": 0.21571974456310272, |
|
"learning_rate": 2.8456812752951485e-05, |
|
"loss": 1.2869, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.0696517412935322, |
|
"grad_norm": 0.22047464549541473, |
|
"learning_rate": 2.636557474057173e-05, |
|
"loss": 1.3308, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.109452736318408, |
|
"grad_norm": 0.21658696234226227, |
|
"learning_rate": 2.4326290324257894e-05, |
|
"loss": 1.3483, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.1492537313432836, |
|
"grad_norm": 0.22115129232406616, |
|
"learning_rate": 2.234344229374003e-05, |
|
"loss": 1.3131, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.189054726368159, |
|
"grad_norm": 0.21616297960281372, |
|
"learning_rate": 2.042138937932388e-05, |
|
"loss": 1.32, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.228855721393035, |
|
"grad_norm": 0.2277287244796753, |
|
"learning_rate": 1.8564356670455767e-05, |
|
"loss": 1.3285, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.2686567164179103, |
|
"grad_norm": 0.21879006922245026, |
|
"learning_rate": 1.677642632805892e-05, |
|
"loss": 1.3478, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.308457711442786, |
|
"grad_norm": 0.2215149849653244, |
|
"learning_rate": 1.5061528611057918e-05, |
|
"loss": 1.3001, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.3482587064676617, |
|
"grad_norm": 0.2294432371854782, |
|
"learning_rate": 1.3423433236816563e-05, |
|
"loss": 1.288, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.388059701492537, |
|
"grad_norm": 0.22794978320598602, |
|
"learning_rate": 1.1865741094480909e-05, |
|
"loss": 1.2953, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.427860696517413, |
|
"grad_norm": 0.22905172407627106, |
|
"learning_rate": 1.0391876329443533e-05, |
|
"loss": 1.3366, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.4676616915422884, |
|
"grad_norm": 0.2197260707616806, |
|
"learning_rate": 9.005078816328771e-06, |
|
"loss": 1.3045, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.5074626865671643, |
|
"grad_norm": 0.23517443239688873, |
|
"learning_rate": 7.708397037045129e-06, |
|
"loss": 1.2732, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.5472636815920398, |
|
"grad_norm": 0.22419849038124084, |
|
"learning_rate": 6.50468137956049e-06, |
|
"loss": 1.3236, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.587064676616915, |
|
"grad_norm": 0.22183862328529358, |
|
"learning_rate": 5.3965778721306755e-06, |
|
"loss": 1.3171, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.626865671641791, |
|
"grad_norm": 0.2313942015171051, |
|
"learning_rate": 4.386522366755169e-06, |
|
"loss": 1.3622, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.22842709720134735, |
|
"learning_rate": 3.476735184645674e-06, |
|
"loss": 1.3202, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.7064676616915424, |
|
"grad_norm": 0.21870732307434082, |
|
"learning_rate": 2.6692162354782944e-06, |
|
"loss": 1.3243, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.746268656716418, |
|
"grad_norm": 0.2380901575088501, |
|
"learning_rate": 1.9657406211579966e-06, |
|
"loss": 1.2638, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.7860696517412933, |
|
"grad_norm": 0.2310485988855362, |
|
"learning_rate": 1.3678547337593494e-06, |
|
"loss": 1.3266, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.825870646766169, |
|
"grad_norm": 0.22601790726184845, |
|
"learning_rate": 8.768728562211947e-07, |
|
"loss": 1.3183, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.8656716417910446, |
|
"grad_norm": 0.22665373980998993, |
|
"learning_rate": 4.938742732674529e-07, |
|
"loss": 1.3247, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.9054726368159205, |
|
"grad_norm": 0.22332097589969635, |
|
"learning_rate": 2.1970089890509527e-07, |
|
"loss": 1.3518, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.945273631840796, |
|
"grad_norm": 0.2267157882452011, |
|
"learning_rate": 5.4955425714431353e-08, |
|
"loss": 1.2951, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"grad_norm": 0.23764650523662567, |
|
"learning_rate": 0.0, |
|
"loss": 1.3156, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"step": 75, |
|
"total_flos": 1.7993258830774927e+18, |
|
"train_loss": 1.423751606941223, |
|
"train_runtime": 7978.1931, |
|
"train_samples_per_second": 0.604, |
|
"train_steps_per_second": 0.009 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 75, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7993258830774927e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|