{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 134, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014925373134328358, "grad_norm": 2.4583136454633454, "learning_rate": 9.99862592554908e-06, "loss": 0.1709, "step": 1 }, { "epoch": 0.029850746268656716, "grad_norm": 1.2601474874951664, "learning_rate": 9.994504457428557e-06, "loss": 0.1095, "step": 2 }, { "epoch": 0.04477611940298507, "grad_norm": 1.8921245514823541, "learning_rate": 9.987637860920053e-06, "loss": 0.1123, "step": 3 }, { "epoch": 0.05970149253731343, "grad_norm": 1.0381477254848812, "learning_rate": 9.978029910109491e-06, "loss": 0.0897, "step": 4 }, { "epoch": 0.07462686567164178, "grad_norm": 0.8231161139163885, "learning_rate": 9.965685885812773e-06, "loss": 0.0804, "step": 5 }, { "epoch": 0.08955223880597014, "grad_norm": 3.3306861693595704, "learning_rate": 9.950612572673255e-06, "loss": 0.1197, "step": 6 }, { "epoch": 0.1044776119402985, "grad_norm": 0.8873942793156309, "learning_rate": 9.932818255432733e-06, "loss": 0.1052, "step": 7 }, { "epoch": 0.11940298507462686, "grad_norm": 0.7726110244054883, "learning_rate": 9.91231271437788e-06, "loss": 0.09, "step": 8 }, { "epoch": 0.13432835820895522, "grad_norm": 0.7713735038013977, "learning_rate": 9.889107219964726e-06, "loss": 0.0847, "step": 9 }, { "epoch": 0.14925373134328357, "grad_norm": 0.8179878874298719, "learning_rate": 9.863214526624065e-06, "loss": 0.0899, "step": 10 }, { "epoch": 0.16417910447761194, "grad_norm": 0.793854117532711, "learning_rate": 9.834648865751254e-06, "loss": 0.0885, "step": 11 }, { "epoch": 0.1791044776119403, "grad_norm": 0.6649283964833135, "learning_rate": 9.803425937884202e-06, "loss": 0.078, "step": 12 }, { "epoch": 0.19402985074626866, "grad_norm": 0.8490726920421898, "learning_rate": 9.769562904073896e-06, "loss": 0.0878, "step": 13 }, { "epoch": 0.208955223880597, "grad_norm": 0.7411456396869534, "learning_rate": 9.733078376452172e-06, "loss": 0.0881, "step": 14 }, { "epoch": 0.22388059701492538, "grad_norm": 0.7448122603540034, "learning_rate": 9.693992408001934e-06, "loss": 0.091, "step": 15 }, { "epoch": 0.23880597014925373, "grad_norm": 0.672661340651816, "learning_rate": 9.652326481535434e-06, "loss": 0.0847, "step": 16 }, { "epoch": 0.2537313432835821, "grad_norm": 0.6295351920280576, "learning_rate": 9.608103497886687e-06, "loss": 0.0751, "step": 17 }, { "epoch": 0.26865671641791045, "grad_norm": 0.5987419993650692, "learning_rate": 9.561347763324484e-06, "loss": 0.0757, "step": 18 }, { "epoch": 0.2835820895522388, "grad_norm": 0.6862099670787631, "learning_rate": 9.512084976192944e-06, "loss": 0.0832, "step": 19 }, { "epoch": 0.29850746268656714, "grad_norm": 0.6474317718234073, "learning_rate": 9.460342212786933e-06, "loss": 0.0785, "step": 20 }, { "epoch": 0.31343283582089554, "grad_norm": 0.6520670338068375, "learning_rate": 9.406147912470142e-06, "loss": 0.0832, "step": 21 }, { "epoch": 0.3283582089552239, "grad_norm": 0.6875547799836488, "learning_rate": 9.349531862043952e-06, "loss": 0.0909, "step": 22 }, { "epoch": 0.34328358208955223, "grad_norm": 0.6406155904616685, "learning_rate": 9.290525179375722e-06, "loss": 0.0817, "step": 23 }, { "epoch": 0.3582089552238806, "grad_norm": 0.6570196434589318, "learning_rate": 9.229160296295488e-06, "loss": 0.0847, "step": 24 }, { "epoch": 0.373134328358209, "grad_norm": 0.5570171509580777, "learning_rate": 9.165470940770458e-06, "loss": 0.0739, "step": 25 }, { "epoch": 0.3880597014925373, "grad_norm": 0.620261743808898, "learning_rate": 9.099492118367123e-06, "loss": 0.0861, "step": 26 }, { "epoch": 0.40298507462686567, "grad_norm": 0.6886602468358877, "learning_rate": 9.03126009301115e-06, "loss": 0.0943, "step": 27 }, { "epoch": 0.417910447761194, "grad_norm": 0.7131708436753863, "learning_rate": 8.960812367055646e-06, "loss": 0.0892, "step": 28 }, { "epoch": 0.43283582089552236, "grad_norm": 0.6391387675910597, "learning_rate": 8.888187660668762e-06, "loss": 0.0832, "step": 29 }, { "epoch": 0.44776119402985076, "grad_norm": 0.6940047679170313, "learning_rate": 8.81342589055191e-06, "loss": 0.0906, "step": 30 }, { "epoch": 0.4626865671641791, "grad_norm": 0.6503008059328946, "learning_rate": 8.736568148000386e-06, "loss": 0.0968, "step": 31 }, { "epoch": 0.47761194029850745, "grad_norm": 0.6090207680984978, "learning_rate": 8.657656676318346e-06, "loss": 0.0847, "step": 32 }, { "epoch": 0.4925373134328358, "grad_norm": 0.6434605637551735, "learning_rate": 8.576734847600639e-06, "loss": 0.0853, "step": 33 }, { "epoch": 0.5074626865671642, "grad_norm": 0.6377279956113436, "learning_rate": 8.49384713889421e-06, "loss": 0.0899, "step": 34 }, { "epoch": 0.5223880597014925, "grad_norm": 0.644362530558179, "learning_rate": 8.40903910775219e-06, "loss": 0.0827, "step": 35 }, { "epoch": 0.5373134328358209, "grad_norm": 0.642985561685258, "learning_rate": 8.32235736719411e-06, "loss": 0.0888, "step": 36 }, { "epoch": 0.5522388059701493, "grad_norm": 0.6790092652999941, "learning_rate": 8.233849560085994e-06, "loss": 0.0846, "step": 37 }, { "epoch": 0.5671641791044776, "grad_norm": 0.7854755458868027, "learning_rate": 8.143564332954426e-06, "loss": 0.1031, "step": 38 }, { "epoch": 0.582089552238806, "grad_norm": 0.6030440972462469, "learning_rate": 8.051551309248961e-06, "loss": 0.0849, "step": 39 }, { "epoch": 0.5970149253731343, "grad_norm": 0.6418639082548031, "learning_rate": 7.957861062067614e-06, "loss": 0.0852, "step": 40 }, { "epoch": 0.6119402985074627, "grad_norm": 0.6566500879981874, "learning_rate": 7.86254508636036e-06, "loss": 0.0887, "step": 41 }, { "epoch": 0.6268656716417911, "grad_norm": 0.6290631945385817, "learning_rate": 7.765655770625997e-06, "loss": 0.0855, "step": 42 }, { "epoch": 0.6417910447761194, "grad_norm": 0.6140886580472066, "learning_rate": 7.667246368117852e-06, "loss": 0.0818, "step": 43 }, { "epoch": 0.6567164179104478, "grad_norm": 0.6174619223952549, "learning_rate": 7.56737096757421e-06, "loss": 0.0856, "step": 44 }, { "epoch": 0.6716417910447762, "grad_norm": 0.6195862366153396, "learning_rate": 7.466084463489537e-06, "loss": 0.0801, "step": 45 }, { "epoch": 0.6865671641791045, "grad_norm": 0.6331901290551701, "learning_rate": 7.363442525942827e-06, "loss": 0.0844, "step": 46 }, { "epoch": 0.7014925373134329, "grad_norm": 0.7083908752048309, "learning_rate": 7.25950156999967e-06, "loss": 0.0894, "step": 47 }, { "epoch": 0.7164179104477612, "grad_norm": 0.6388671244490378, "learning_rate": 7.1543187247048525e-06, "loss": 0.0906, "step": 48 }, { "epoch": 0.7313432835820896, "grad_norm": 0.5777376817133963, "learning_rate": 7.047951801682533e-06, "loss": 0.0775, "step": 49 }, { "epoch": 0.746268656716418, "grad_norm": 0.5907825361104937, "learning_rate": 6.9404592633612486e-06, "loss": 0.08, "step": 50 }, { "epoch": 0.7611940298507462, "grad_norm": 0.6197333346964937, "learning_rate": 6.831900190841232e-06, "loss": 0.0852, "step": 51 }, { "epoch": 0.7761194029850746, "grad_norm": 0.580913389747538, "learning_rate": 6.722334251421665e-06, "loss": 0.0786, "step": 52 }, { "epoch": 0.7910447761194029, "grad_norm": 0.6769846397314266, "learning_rate": 6.611821665805769e-06, "loss": 0.0859, "step": 53 }, { "epoch": 0.8059701492537313, "grad_norm": 0.6493382927830379, "learning_rate": 6.500423175001705e-06, "loss": 0.0918, "step": 54 }, { "epoch": 0.8208955223880597, "grad_norm": 0.6002157339319569, "learning_rate": 6.388200006937503e-06, "loss": 0.089, "step": 55 }, { "epoch": 0.835820895522388, "grad_norm": 0.5447504002720616, "learning_rate": 6.275213842808383e-06, "loss": 0.0731, "step": 56 }, { "epoch": 0.8507462686567164, "grad_norm": 0.5621404387468115, "learning_rate": 6.161526783174917e-06, "loss": 0.0762, "step": 57 }, { "epoch": 0.8656716417910447, "grad_norm": 0.6279143619008688, "learning_rate": 6.047201313830724e-06, "loss": 0.0921, "step": 58 }, { "epoch": 0.8805970149253731, "grad_norm": 0.5567853485138207, "learning_rate": 5.932300271458406e-06, "loss": 0.0692, "step": 59 }, { "epoch": 0.8955223880597015, "grad_norm": 0.5831390070498211, "learning_rate": 5.816886809092651e-06, "loss": 0.0777, "step": 60 }, { "epoch": 0.9104477611940298, "grad_norm": 0.575437949186205, "learning_rate": 5.701024361409431e-06, "loss": 0.0803, "step": 61 }, { "epoch": 0.9253731343283582, "grad_norm": 0.6339642524977537, "learning_rate": 5.584776609860414e-06, "loss": 0.0893, "step": 62 }, { "epoch": 0.9402985074626866, "grad_norm": 0.5924535700885266, "learning_rate": 5.468207447671755e-06, "loss": 0.0844, "step": 63 }, { "epoch": 0.9552238805970149, "grad_norm": 0.6155460855755083, "learning_rate": 5.351380944726465e-06, "loss": 0.0836, "step": 64 }, { "epoch": 0.9701492537313433, "grad_norm": 0.640329747223743, "learning_rate": 5.234361312349701e-06, "loss": 0.0951, "step": 65 }, { "epoch": 0.9850746268656716, "grad_norm": 0.5300911342515855, "learning_rate": 5.117212868016303e-06, "loss": 0.0655, "step": 66 }, { "epoch": 1.0, "grad_norm": 0.4687094870612933, "learning_rate": 5e-06, "loss": 0.0424, "step": 67 }, { "epoch": 1.0149253731343284, "grad_norm": 0.4076938808158179, "learning_rate": 4.882787131983698e-06, "loss": 0.0319, "step": 68 }, { "epoch": 1.0298507462686568, "grad_norm": 0.44699344595961127, "learning_rate": 4.765638687650299e-06, "loss": 0.0397, "step": 69 }, { "epoch": 1.044776119402985, "grad_norm": 0.44767644296408526, "learning_rate": 4.6486190552735375e-06, "loss": 0.0335, "step": 70 }, { "epoch": 1.0597014925373134, "grad_norm": 0.37505346262796413, "learning_rate": 4.531792552328247e-06, "loss": 0.0285, "step": 71 }, { "epoch": 1.0746268656716418, "grad_norm": 0.4657077829483448, "learning_rate": 4.415223390139588e-06, "loss": 0.0326, "step": 72 }, { "epoch": 1.0895522388059702, "grad_norm": 0.34306186183329557, "learning_rate": 4.2989756385905715e-06, "loss": 0.0244, "step": 73 }, { "epoch": 1.1044776119402986, "grad_norm": 0.39032497882907513, "learning_rate": 4.183113190907349e-06, "loss": 0.027, "step": 74 }, { "epoch": 1.1194029850746268, "grad_norm": 0.4702211175747881, "learning_rate": 4.067699728541595e-06, "loss": 0.0316, "step": 75 }, { "epoch": 1.1343283582089552, "grad_norm": 0.4455655473637308, "learning_rate": 3.952798686169279e-06, "loss": 0.0303, "step": 76 }, { "epoch": 1.1492537313432836, "grad_norm": 0.4371493860049261, "learning_rate": 3.838473216825085e-06, "loss": 0.0282, "step": 77 }, { "epoch": 1.164179104477612, "grad_norm": 0.4289807711491057, "learning_rate": 3.7247861571916183e-06, "loss": 0.0272, "step": 78 }, { "epoch": 1.1791044776119404, "grad_norm": 0.5368576489602601, "learning_rate": 3.611799993062497e-06, "loss": 0.0351, "step": 79 }, { "epoch": 1.1940298507462686, "grad_norm": 0.6142429871519881, "learning_rate": 3.4995768249982975e-06, "loss": 0.0377, "step": 80 }, { "epoch": 1.208955223880597, "grad_norm": 0.4437616080442814, "learning_rate": 3.388178334194232e-06, "loss": 0.0254, "step": 81 }, { "epoch": 1.2238805970149254, "grad_norm": 0.425005368183552, "learning_rate": 3.2776657485783357e-06, "loss": 0.0217, "step": 82 }, { "epoch": 1.2388059701492538, "grad_norm": 0.45656133078022915, "learning_rate": 3.168099809158769e-06, "loss": 0.0257, "step": 83 }, { "epoch": 1.2537313432835822, "grad_norm": 0.543019359956107, "learning_rate": 3.059540736638751e-06, "loss": 0.0291, "step": 84 }, { "epoch": 1.2686567164179103, "grad_norm": 0.48489281581823257, "learning_rate": 2.9520481983174675e-06, "loss": 0.0249, "step": 85 }, { "epoch": 1.2835820895522387, "grad_norm": 0.5065165276548232, "learning_rate": 2.8456812752951483e-06, "loss": 0.0226, "step": 86 }, { "epoch": 1.2985074626865671, "grad_norm": 0.5835532502940851, "learning_rate": 2.740498430000332e-06, "loss": 0.0262, "step": 87 }, { "epoch": 1.3134328358208955, "grad_norm": 0.41781499689334245, "learning_rate": 2.636557474057173e-06, "loss": 0.0171, "step": 88 }, { "epoch": 1.328358208955224, "grad_norm": 0.60361339503175, "learning_rate": 2.533915536510464e-06, "loss": 0.0249, "step": 89 }, { "epoch": 1.3432835820895521, "grad_norm": 0.47456828088394937, "learning_rate": 2.4326290324257896e-06, "loss": 0.0246, "step": 90 }, { "epoch": 1.3582089552238805, "grad_norm": 0.4465597229891302, "learning_rate": 2.3327536318821496e-06, "loss": 0.0183, "step": 91 }, { "epoch": 1.373134328358209, "grad_norm": 0.49082928585159896, "learning_rate": 2.234344229374003e-06, "loss": 0.0272, "step": 92 }, { "epoch": 1.3880597014925373, "grad_norm": 0.4703289245897022, "learning_rate": 2.1374549136396417e-06, "loss": 0.0259, "step": 93 }, { "epoch": 1.4029850746268657, "grad_norm": 0.4919381501347153, "learning_rate": 2.042138937932388e-06, "loss": 0.0244, "step": 94 }, { "epoch": 1.417910447761194, "grad_norm": 0.48234747726237315, "learning_rate": 1.9484486907510405e-06, "loss": 0.0259, "step": 95 }, { "epoch": 1.4328358208955223, "grad_norm": 0.46679191916730023, "learning_rate": 1.856435667045577e-06, "loss": 0.0248, "step": 96 }, { "epoch": 1.4477611940298507, "grad_norm": 0.4915526917266499, "learning_rate": 1.7661504399140066e-06, "loss": 0.0253, "step": 97 }, { "epoch": 1.462686567164179, "grad_norm": 0.4959939539468189, "learning_rate": 1.677642632805892e-06, "loss": 0.029, "step": 98 }, { "epoch": 1.4776119402985075, "grad_norm": 0.47533058613210927, "learning_rate": 1.5909608922478108e-06, "loss": 0.0222, "step": 99 }, { "epoch": 1.4925373134328357, "grad_norm": 0.41184618371805026, "learning_rate": 1.5061528611057917e-06, "loss": 0.0186, "step": 100 }, { "epoch": 1.5074626865671643, "grad_norm": 0.5029260437214431, "learning_rate": 1.4232651523993635e-06, "loss": 0.0246, "step": 101 }, { "epoch": 1.5223880597014925, "grad_norm": 0.4856272674409962, "learning_rate": 1.3423433236816563e-06, "loss": 0.0262, "step": 102 }, { "epoch": 1.537313432835821, "grad_norm": 0.5515452132397586, "learning_rate": 1.2634318519996148e-06, "loss": 0.0265, "step": 103 }, { "epoch": 1.5522388059701493, "grad_norm": 0.48385801987338495, "learning_rate": 1.186574109448091e-06, "loss": 0.0253, "step": 104 }, { "epoch": 1.5671641791044775, "grad_norm": 0.4977052401413162, "learning_rate": 1.1118123393312397e-06, "loss": 0.0317, "step": 105 }, { "epoch": 1.582089552238806, "grad_norm": 0.5013927727792972, "learning_rate": 1.0391876329443534e-06, "loss": 0.0248, "step": 106 }, { "epoch": 1.5970149253731343, "grad_norm": 0.5012798267287072, "learning_rate": 9.687399069888515e-07, "loss": 0.0253, "step": 107 }, { "epoch": 1.6119402985074627, "grad_norm": 0.4798753241925998, "learning_rate": 9.005078816328772e-07, "loss": 0.0208, "step": 108 }, { "epoch": 1.626865671641791, "grad_norm": 0.4892720130785747, "learning_rate": 8.345290592295429e-07, "loss": 0.0256, "step": 109 }, { "epoch": 1.6417910447761193, "grad_norm": 0.44880610381720976, "learning_rate": 7.708397037045129e-07, "loss": 0.024, "step": 110 }, { "epoch": 1.6567164179104479, "grad_norm": 0.5198878749228643, "learning_rate": 7.094748206242797e-07, "loss": 0.0282, "step": 111 }, { "epoch": 1.671641791044776, "grad_norm": 0.4259309199030039, "learning_rate": 6.50468137956049e-07, "loss": 0.0203, "step": 112 }, { "epoch": 1.6865671641791045, "grad_norm": 0.44723898312223315, "learning_rate": 5.938520875298587e-07, "loss": 0.0262, "step": 113 }, { "epoch": 1.7014925373134329, "grad_norm": 0.4649229524932995, "learning_rate": 5.396577872130676e-07, "loss": 0.0234, "step": 114 }, { "epoch": 1.716417910447761, "grad_norm": 0.46689935485099987, "learning_rate": 4.879150238070585e-07, "loss": 0.0235, "step": 115 }, { "epoch": 1.7313432835820897, "grad_norm": 0.4250501253858005, "learning_rate": 4.386522366755169e-07, "loss": 0.021, "step": 116 }, { "epoch": 1.7462686567164178, "grad_norm": 0.4965625077414352, "learning_rate": 3.918965021133131e-07, "loss": 0.0285, "step": 117 }, { "epoch": 1.7611940298507462, "grad_norm": 0.5192751153508381, "learning_rate": 3.4767351846456744e-07, "loss": 0.0294, "step": 118 }, { "epoch": 1.7761194029850746, "grad_norm": 0.4555879578505303, "learning_rate": 3.0600759199806815e-07, "loss": 0.026, "step": 119 }, { "epoch": 1.7910447761194028, "grad_norm": 0.4702463723473376, "learning_rate": 2.669216235478295e-07, "loss": 0.0268, "step": 120 }, { "epoch": 1.8059701492537314, "grad_norm": 0.5014638902986028, "learning_rate": 2.3043709592610486e-07, "loss": 0.0282, "step": 121 }, { "epoch": 1.8208955223880596, "grad_norm": 0.5015871929717038, "learning_rate": 1.9657406211579966e-07, "loss": 0.0276, "step": 122 }, { "epoch": 1.835820895522388, "grad_norm": 0.46199043552912816, "learning_rate": 1.6535113424874683e-07, "loss": 0.0253, "step": 123 }, { "epoch": 1.8507462686567164, "grad_norm": 0.472386855814729, "learning_rate": 1.3678547337593494e-07, "loss": 0.0231, "step": 124 }, { "epoch": 1.8656716417910446, "grad_norm": 0.44628541420234796, "learning_rate": 1.1089278003527438e-07, "loss": 0.0229, "step": 125 }, { "epoch": 1.8805970149253732, "grad_norm": 0.4945658271403232, "learning_rate": 8.768728562211948e-08, "loss": 0.0259, "step": 126 }, { "epoch": 1.8955223880597014, "grad_norm": 0.4631666908930044, "learning_rate": 6.718174456726789e-08, "loss": 0.0254, "step": 127 }, { "epoch": 1.9104477611940298, "grad_norm": 0.47821378359248506, "learning_rate": 4.9387427326745287e-08, "loss": 0.0257, "step": 128 }, { "epoch": 1.9253731343283582, "grad_norm": 0.4463060792083917, "learning_rate": 3.431411418722941e-08, "loss": 0.023, "step": 129 }, { "epoch": 1.9402985074626866, "grad_norm": 0.4458879106581698, "learning_rate": 2.1970089890509527e-08, "loss": 0.0237, "step": 130 }, { "epoch": 1.955223880597015, "grad_norm": 0.39654579447619215, "learning_rate": 1.2362139079949431e-08, "loss": 0.0193, "step": 131 }, { "epoch": 1.9701492537313432, "grad_norm": 0.4384754664189478, "learning_rate": 5.495542571443135e-09, "loss": 0.0227, "step": 132 }, { "epoch": 1.9850746268656716, "grad_norm": 0.40331068244695245, "learning_rate": 1.3740744509205263e-09, "loss": 0.0209, "step": 133 }, { "epoch": 2.0, "grad_norm": 0.3037784601924941, "learning_rate": 0.0, "loss": 0.013, "step": 134 }, { "epoch": 2.0, "step": 134, "total_flos": 8781261963264.0, "train_loss": 0.05639281026574213, "train_runtime": 528.9145, "train_samples_per_second": 2.0, "train_steps_per_second": 0.253 } ], "logging_steps": 1, "max_steps": 134, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8781261963264.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }