|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 200, |
|
"global_step": 134, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014925373134328358, |
|
"grad_norm": 2.4583136454633454, |
|
"learning_rate": 9.99862592554908e-06, |
|
"loss": 0.1709, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.029850746268656716, |
|
"grad_norm": 1.2601474874951664, |
|
"learning_rate": 9.994504457428557e-06, |
|
"loss": 0.1095, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04477611940298507, |
|
"grad_norm": 1.8921245514823541, |
|
"learning_rate": 9.987637860920053e-06, |
|
"loss": 0.1123, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.05970149253731343, |
|
"grad_norm": 1.0381477254848812, |
|
"learning_rate": 9.978029910109491e-06, |
|
"loss": 0.0897, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07462686567164178, |
|
"grad_norm": 0.8231161139163885, |
|
"learning_rate": 9.965685885812773e-06, |
|
"loss": 0.0804, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08955223880597014, |
|
"grad_norm": 3.3306861693595704, |
|
"learning_rate": 9.950612572673255e-06, |
|
"loss": 0.1197, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1044776119402985, |
|
"grad_norm": 0.8873942793156309, |
|
"learning_rate": 9.932818255432733e-06, |
|
"loss": 0.1052, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 0.7726110244054883, |
|
"learning_rate": 9.91231271437788e-06, |
|
"loss": 0.09, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.13432835820895522, |
|
"grad_norm": 0.7713735038013977, |
|
"learning_rate": 9.889107219964726e-06, |
|
"loss": 0.0847, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 0.8179878874298719, |
|
"learning_rate": 9.863214526624065e-06, |
|
"loss": 0.0899, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16417910447761194, |
|
"grad_norm": 0.793854117532711, |
|
"learning_rate": 9.834648865751254e-06, |
|
"loss": 0.0885, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1791044776119403, |
|
"grad_norm": 0.6649283964833135, |
|
"learning_rate": 9.803425937884202e-06, |
|
"loss": 0.078, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.19402985074626866, |
|
"grad_norm": 0.8490726920421898, |
|
"learning_rate": 9.769562904073896e-06, |
|
"loss": 0.0878, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.208955223880597, |
|
"grad_norm": 0.7411456396869534, |
|
"learning_rate": 9.733078376452172e-06, |
|
"loss": 0.0881, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.22388059701492538, |
|
"grad_norm": 0.7448122603540034, |
|
"learning_rate": 9.693992408001934e-06, |
|
"loss": 0.091, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 0.672661340651816, |
|
"learning_rate": 9.652326481535434e-06, |
|
"loss": 0.0847, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2537313432835821, |
|
"grad_norm": 0.6295351920280576, |
|
"learning_rate": 9.608103497886687e-06, |
|
"loss": 0.0751, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.26865671641791045, |
|
"grad_norm": 0.5987419993650692, |
|
"learning_rate": 9.561347763324484e-06, |
|
"loss": 0.0757, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2835820895522388, |
|
"grad_norm": 0.6862099670787631, |
|
"learning_rate": 9.512084976192944e-06, |
|
"loss": 0.0832, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.6474317718234073, |
|
"learning_rate": 9.460342212786933e-06, |
|
"loss": 0.0785, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.31343283582089554, |
|
"grad_norm": 0.6520670338068375, |
|
"learning_rate": 9.406147912470142e-06, |
|
"loss": 0.0832, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3283582089552239, |
|
"grad_norm": 0.6875547799836488, |
|
"learning_rate": 9.349531862043952e-06, |
|
"loss": 0.0909, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.34328358208955223, |
|
"grad_norm": 0.6406155904616685, |
|
"learning_rate": 9.290525179375722e-06, |
|
"loss": 0.0817, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 0.6570196434589318, |
|
"learning_rate": 9.229160296295488e-06, |
|
"loss": 0.0847, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 0.5570171509580777, |
|
"learning_rate": 9.165470940770458e-06, |
|
"loss": 0.0739, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3880597014925373, |
|
"grad_norm": 0.620261743808898, |
|
"learning_rate": 9.099492118367123e-06, |
|
"loss": 0.0861, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.40298507462686567, |
|
"grad_norm": 0.6886602468358877, |
|
"learning_rate": 9.03126009301115e-06, |
|
"loss": 0.0943, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.417910447761194, |
|
"grad_norm": 0.7131708436753863, |
|
"learning_rate": 8.960812367055646e-06, |
|
"loss": 0.0892, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.43283582089552236, |
|
"grad_norm": 0.6391387675910597, |
|
"learning_rate": 8.888187660668762e-06, |
|
"loss": 0.0832, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 0.6940047679170313, |
|
"learning_rate": 8.81342589055191e-06, |
|
"loss": 0.0906, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4626865671641791, |
|
"grad_norm": 0.6503008059328946, |
|
"learning_rate": 8.736568148000386e-06, |
|
"loss": 0.0968, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 0.6090207680984978, |
|
"learning_rate": 8.657656676318346e-06, |
|
"loss": 0.0847, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.4925373134328358, |
|
"grad_norm": 0.6434605637551735, |
|
"learning_rate": 8.576734847600639e-06, |
|
"loss": 0.0853, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5074626865671642, |
|
"grad_norm": 0.6377279956113436, |
|
"learning_rate": 8.49384713889421e-06, |
|
"loss": 0.0899, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5223880597014925, |
|
"grad_norm": 0.644362530558179, |
|
"learning_rate": 8.40903910775219e-06, |
|
"loss": 0.0827, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5373134328358209, |
|
"grad_norm": 0.642985561685258, |
|
"learning_rate": 8.32235736719411e-06, |
|
"loss": 0.0888, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5522388059701493, |
|
"grad_norm": 0.6790092652999941, |
|
"learning_rate": 8.233849560085994e-06, |
|
"loss": 0.0846, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5671641791044776, |
|
"grad_norm": 0.7854755458868027, |
|
"learning_rate": 8.143564332954426e-06, |
|
"loss": 0.1031, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.582089552238806, |
|
"grad_norm": 0.6030440972462469, |
|
"learning_rate": 8.051551309248961e-06, |
|
"loss": 0.0849, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.6418639082548031, |
|
"learning_rate": 7.957861062067614e-06, |
|
"loss": 0.0852, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6119402985074627, |
|
"grad_norm": 0.6566500879981874, |
|
"learning_rate": 7.86254508636036e-06, |
|
"loss": 0.0887, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6268656716417911, |
|
"grad_norm": 0.6290631945385817, |
|
"learning_rate": 7.765655770625997e-06, |
|
"loss": 0.0855, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6417910447761194, |
|
"grad_norm": 0.6140886580472066, |
|
"learning_rate": 7.667246368117852e-06, |
|
"loss": 0.0818, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6567164179104478, |
|
"grad_norm": 0.6174619223952549, |
|
"learning_rate": 7.56737096757421e-06, |
|
"loss": 0.0856, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6716417910447762, |
|
"grad_norm": 0.6195862366153396, |
|
"learning_rate": 7.466084463489537e-06, |
|
"loss": 0.0801, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6865671641791045, |
|
"grad_norm": 0.6331901290551701, |
|
"learning_rate": 7.363442525942827e-06, |
|
"loss": 0.0844, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7014925373134329, |
|
"grad_norm": 0.7083908752048309, |
|
"learning_rate": 7.25950156999967e-06, |
|
"loss": 0.0894, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 0.6388671244490378, |
|
"learning_rate": 7.1543187247048525e-06, |
|
"loss": 0.0906, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7313432835820896, |
|
"grad_norm": 0.5777376817133963, |
|
"learning_rate": 7.047951801682533e-06, |
|
"loss": 0.0775, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 0.5907825361104937, |
|
"learning_rate": 6.9404592633612486e-06, |
|
"loss": 0.08, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7611940298507462, |
|
"grad_norm": 0.6197333346964937, |
|
"learning_rate": 6.831900190841232e-06, |
|
"loss": 0.0852, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7761194029850746, |
|
"grad_norm": 0.580913389747538, |
|
"learning_rate": 6.722334251421665e-06, |
|
"loss": 0.0786, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7910447761194029, |
|
"grad_norm": 0.6769846397314266, |
|
"learning_rate": 6.611821665805769e-06, |
|
"loss": 0.0859, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.8059701492537313, |
|
"grad_norm": 0.6493382927830379, |
|
"learning_rate": 6.500423175001705e-06, |
|
"loss": 0.0918, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8208955223880597, |
|
"grad_norm": 0.6002157339319569, |
|
"learning_rate": 6.388200006937503e-06, |
|
"loss": 0.089, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 0.5447504002720616, |
|
"learning_rate": 6.275213842808383e-06, |
|
"loss": 0.0731, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.8507462686567164, |
|
"grad_norm": 0.5621404387468115, |
|
"learning_rate": 6.161526783174917e-06, |
|
"loss": 0.0762, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8656716417910447, |
|
"grad_norm": 0.6279143619008688, |
|
"learning_rate": 6.047201313830724e-06, |
|
"loss": 0.0921, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8805970149253731, |
|
"grad_norm": 0.5567853485138207, |
|
"learning_rate": 5.932300271458406e-06, |
|
"loss": 0.0692, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.5831390070498211, |
|
"learning_rate": 5.816886809092651e-06, |
|
"loss": 0.0777, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9104477611940298, |
|
"grad_norm": 0.575437949186205, |
|
"learning_rate": 5.701024361409431e-06, |
|
"loss": 0.0803, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9253731343283582, |
|
"grad_norm": 0.6339642524977537, |
|
"learning_rate": 5.584776609860414e-06, |
|
"loss": 0.0893, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.9402985074626866, |
|
"grad_norm": 0.5924535700885266, |
|
"learning_rate": 5.468207447671755e-06, |
|
"loss": 0.0844, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 0.6155460855755083, |
|
"learning_rate": 5.351380944726465e-06, |
|
"loss": 0.0836, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.9701492537313433, |
|
"grad_norm": 0.640329747223743, |
|
"learning_rate": 5.234361312349701e-06, |
|
"loss": 0.0951, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9850746268656716, |
|
"grad_norm": 0.5300911342515855, |
|
"learning_rate": 5.117212868016303e-06, |
|
"loss": 0.0655, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4687094870612933, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0424, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.0149253731343284, |
|
"grad_norm": 0.4076938808158179, |
|
"learning_rate": 4.882787131983698e-06, |
|
"loss": 0.0319, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.0298507462686568, |
|
"grad_norm": 0.44699344595961127, |
|
"learning_rate": 4.765638687650299e-06, |
|
"loss": 0.0397, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.044776119402985, |
|
"grad_norm": 0.44767644296408526, |
|
"learning_rate": 4.6486190552735375e-06, |
|
"loss": 0.0335, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0597014925373134, |
|
"grad_norm": 0.37505346262796413, |
|
"learning_rate": 4.531792552328247e-06, |
|
"loss": 0.0285, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.0746268656716418, |
|
"grad_norm": 0.4657077829483448, |
|
"learning_rate": 4.415223390139588e-06, |
|
"loss": 0.0326, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.0895522388059702, |
|
"grad_norm": 0.34306186183329557, |
|
"learning_rate": 4.2989756385905715e-06, |
|
"loss": 0.0244, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.1044776119402986, |
|
"grad_norm": 0.39032497882907513, |
|
"learning_rate": 4.183113190907349e-06, |
|
"loss": 0.027, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"grad_norm": 0.4702211175747881, |
|
"learning_rate": 4.067699728541595e-06, |
|
"loss": 0.0316, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.1343283582089552, |
|
"grad_norm": 0.4455655473637308, |
|
"learning_rate": 3.952798686169279e-06, |
|
"loss": 0.0303, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.1492537313432836, |
|
"grad_norm": 0.4371493860049261, |
|
"learning_rate": 3.838473216825085e-06, |
|
"loss": 0.0282, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.164179104477612, |
|
"grad_norm": 0.4289807711491057, |
|
"learning_rate": 3.7247861571916183e-06, |
|
"loss": 0.0272, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.1791044776119404, |
|
"grad_norm": 0.5368576489602601, |
|
"learning_rate": 3.611799993062497e-06, |
|
"loss": 0.0351, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.1940298507462686, |
|
"grad_norm": 0.6142429871519881, |
|
"learning_rate": 3.4995768249982975e-06, |
|
"loss": 0.0377, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.208955223880597, |
|
"grad_norm": 0.4437616080442814, |
|
"learning_rate": 3.388178334194232e-06, |
|
"loss": 0.0254, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.2238805970149254, |
|
"grad_norm": 0.425005368183552, |
|
"learning_rate": 3.2776657485783357e-06, |
|
"loss": 0.0217, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.2388059701492538, |
|
"grad_norm": 0.45656133078022915, |
|
"learning_rate": 3.168099809158769e-06, |
|
"loss": 0.0257, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.2537313432835822, |
|
"grad_norm": 0.543019359956107, |
|
"learning_rate": 3.059540736638751e-06, |
|
"loss": 0.0291, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.2686567164179103, |
|
"grad_norm": 0.48489281581823257, |
|
"learning_rate": 2.9520481983174675e-06, |
|
"loss": 0.0249, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.2835820895522387, |
|
"grad_norm": 0.5065165276548232, |
|
"learning_rate": 2.8456812752951483e-06, |
|
"loss": 0.0226, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.2985074626865671, |
|
"grad_norm": 0.5835532502940851, |
|
"learning_rate": 2.740498430000332e-06, |
|
"loss": 0.0262, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.3134328358208955, |
|
"grad_norm": 0.41781499689334245, |
|
"learning_rate": 2.636557474057173e-06, |
|
"loss": 0.0171, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.328358208955224, |
|
"grad_norm": 0.60361339503175, |
|
"learning_rate": 2.533915536510464e-06, |
|
"loss": 0.0249, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.3432835820895521, |
|
"grad_norm": 0.47456828088394937, |
|
"learning_rate": 2.4326290324257896e-06, |
|
"loss": 0.0246, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3582089552238805, |
|
"grad_norm": 0.4465597229891302, |
|
"learning_rate": 2.3327536318821496e-06, |
|
"loss": 0.0183, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.373134328358209, |
|
"grad_norm": 0.49082928585159896, |
|
"learning_rate": 2.234344229374003e-06, |
|
"loss": 0.0272, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.3880597014925373, |
|
"grad_norm": 0.4703289245897022, |
|
"learning_rate": 2.1374549136396417e-06, |
|
"loss": 0.0259, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.4029850746268657, |
|
"grad_norm": 0.4919381501347153, |
|
"learning_rate": 2.042138937932388e-06, |
|
"loss": 0.0244, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.417910447761194, |
|
"grad_norm": 0.48234747726237315, |
|
"learning_rate": 1.9484486907510405e-06, |
|
"loss": 0.0259, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.4328358208955223, |
|
"grad_norm": 0.46679191916730023, |
|
"learning_rate": 1.856435667045577e-06, |
|
"loss": 0.0248, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.4477611940298507, |
|
"grad_norm": 0.4915526917266499, |
|
"learning_rate": 1.7661504399140066e-06, |
|
"loss": 0.0253, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.462686567164179, |
|
"grad_norm": 0.4959939539468189, |
|
"learning_rate": 1.677642632805892e-06, |
|
"loss": 0.029, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.4776119402985075, |
|
"grad_norm": 0.47533058613210927, |
|
"learning_rate": 1.5909608922478108e-06, |
|
"loss": 0.0222, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"grad_norm": 0.41184618371805026, |
|
"learning_rate": 1.5061528611057917e-06, |
|
"loss": 0.0186, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5074626865671643, |
|
"grad_norm": 0.5029260437214431, |
|
"learning_rate": 1.4232651523993635e-06, |
|
"loss": 0.0246, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.5223880597014925, |
|
"grad_norm": 0.4856272674409962, |
|
"learning_rate": 1.3423433236816563e-06, |
|
"loss": 0.0262, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.537313432835821, |
|
"grad_norm": 0.5515452132397586, |
|
"learning_rate": 1.2634318519996148e-06, |
|
"loss": 0.0265, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.5522388059701493, |
|
"grad_norm": 0.48385801987338495, |
|
"learning_rate": 1.186574109448091e-06, |
|
"loss": 0.0253, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.5671641791044775, |
|
"grad_norm": 0.4977052401413162, |
|
"learning_rate": 1.1118123393312397e-06, |
|
"loss": 0.0317, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.582089552238806, |
|
"grad_norm": 0.5013927727792972, |
|
"learning_rate": 1.0391876329443534e-06, |
|
"loss": 0.0248, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.5970149253731343, |
|
"grad_norm": 0.5012798267287072, |
|
"learning_rate": 9.687399069888515e-07, |
|
"loss": 0.0253, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.6119402985074627, |
|
"grad_norm": 0.4798753241925998, |
|
"learning_rate": 9.005078816328772e-07, |
|
"loss": 0.0208, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.626865671641791, |
|
"grad_norm": 0.4892720130785747, |
|
"learning_rate": 8.345290592295429e-07, |
|
"loss": 0.0256, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.6417910447761193, |
|
"grad_norm": 0.44880610381720976, |
|
"learning_rate": 7.708397037045129e-07, |
|
"loss": 0.024, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.6567164179104479, |
|
"grad_norm": 0.5198878749228643, |
|
"learning_rate": 7.094748206242797e-07, |
|
"loss": 0.0282, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.671641791044776, |
|
"grad_norm": 0.4259309199030039, |
|
"learning_rate": 6.50468137956049e-07, |
|
"loss": 0.0203, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.6865671641791045, |
|
"grad_norm": 0.44723898312223315, |
|
"learning_rate": 5.938520875298587e-07, |
|
"loss": 0.0262, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.7014925373134329, |
|
"grad_norm": 0.4649229524932995, |
|
"learning_rate": 5.396577872130676e-07, |
|
"loss": 0.0234, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.716417910447761, |
|
"grad_norm": 0.46689935485099987, |
|
"learning_rate": 4.879150238070585e-07, |
|
"loss": 0.0235, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.7313432835820897, |
|
"grad_norm": 0.4250501253858005, |
|
"learning_rate": 4.386522366755169e-07, |
|
"loss": 0.021, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.7462686567164178, |
|
"grad_norm": 0.4965625077414352, |
|
"learning_rate": 3.918965021133131e-07, |
|
"loss": 0.0285, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.7611940298507462, |
|
"grad_norm": 0.5192751153508381, |
|
"learning_rate": 3.4767351846456744e-07, |
|
"loss": 0.0294, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.7761194029850746, |
|
"grad_norm": 0.4555879578505303, |
|
"learning_rate": 3.0600759199806815e-07, |
|
"loss": 0.026, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.7910447761194028, |
|
"grad_norm": 0.4702463723473376, |
|
"learning_rate": 2.669216235478295e-07, |
|
"loss": 0.0268, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.8059701492537314, |
|
"grad_norm": 0.5014638902986028, |
|
"learning_rate": 2.3043709592610486e-07, |
|
"loss": 0.0282, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.8208955223880596, |
|
"grad_norm": 0.5015871929717038, |
|
"learning_rate": 1.9657406211579966e-07, |
|
"loss": 0.0276, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.835820895522388, |
|
"grad_norm": 0.46199043552912816, |
|
"learning_rate": 1.6535113424874683e-07, |
|
"loss": 0.0253, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.8507462686567164, |
|
"grad_norm": 0.472386855814729, |
|
"learning_rate": 1.3678547337593494e-07, |
|
"loss": 0.0231, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"grad_norm": 0.44628541420234796, |
|
"learning_rate": 1.1089278003527438e-07, |
|
"loss": 0.0229, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.8805970149253732, |
|
"grad_norm": 0.4945658271403232, |
|
"learning_rate": 8.768728562211948e-08, |
|
"loss": 0.0259, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.8955223880597014, |
|
"grad_norm": 0.4631666908930044, |
|
"learning_rate": 6.718174456726789e-08, |
|
"loss": 0.0254, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.9104477611940298, |
|
"grad_norm": 0.47821378359248506, |
|
"learning_rate": 4.9387427326745287e-08, |
|
"loss": 0.0257, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.9253731343283582, |
|
"grad_norm": 0.4463060792083917, |
|
"learning_rate": 3.431411418722941e-08, |
|
"loss": 0.023, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.9402985074626866, |
|
"grad_norm": 0.4458879106581698, |
|
"learning_rate": 2.1970089890509527e-08, |
|
"loss": 0.0237, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.955223880597015, |
|
"grad_norm": 0.39654579447619215, |
|
"learning_rate": 1.2362139079949431e-08, |
|
"loss": 0.0193, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.9701492537313432, |
|
"grad_norm": 0.4384754664189478, |
|
"learning_rate": 5.495542571443135e-09, |
|
"loss": 0.0227, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.9850746268656716, |
|
"grad_norm": 0.40331068244695245, |
|
"learning_rate": 1.3740744509205263e-09, |
|
"loss": 0.0209, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3037784601924941, |
|
"learning_rate": 0.0, |
|
"loss": 0.013, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 134, |
|
"total_flos": 8781261963264.0, |
|
"train_loss": 0.05639281026574213, |
|
"train_runtime": 528.9145, |
|
"train_samples_per_second": 2.0, |
|
"train_steps_per_second": 0.253 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8781261963264.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|