diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10281 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999059885306008, + "eval_steps": 500, + "global_step": 7977, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.833333333333334e-07, + "loss": 3.0143, + "loss_": 1.9639, + "moe_loss": 0.1727, + "moe_loss_longrong": 1.5072, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 1.1666666666666668e-06, + "loss": 3.1915, + "loss_": 1.6485, + "moe_loss": 0.1717, + "moe_loss_longrong": 1.5034, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 1.75e-06, + "loss": 3.1397, + "loss_": 1.5496, + "moe_loss": 0.171, + "moe_loss_longrong": 1.4995, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 2.3333333333333336e-06, + "loss": 3.0585, + "loss_": 2.0086, + "moe_loss": 0.1701, + "moe_loss_longrong": 1.4995, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 2.916666666666667e-06, + "loss": 2.9633, + "loss_": 1.0198, + "moe_loss": 0.1676, + "moe_loss_longrong": 1.5194, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 3.5e-06, + "loss": 3.0075, + "loss_": 1.4577, + "moe_loss": 0.1675, + "moe_loss_longrong": 1.4922, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 4.083333333333334e-06, + "loss": 2.9169, + "loss_": 1.446, + "moe_loss": 0.1664, + "moe_loss_longrong": 1.4899, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 4.666666666666667e-06, + "loss": 2.9903, + "loss_": 1.3955, + "moe_loss": 0.1647, + "moe_loss_longrong": 1.483, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 5.2500000000000006e-06, + "loss": 2.9445, + "loss_": 1.7934, + "moe_loss": 0.1646, + "moe_loss_longrong": 1.4828, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 5.833333333333334e-06, + "loss": 2.9122, + "loss_": 1.4323, + "moe_loss": 0.1631, + "moe_loss_longrong": 1.4793, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 6.416666666666667e-06, + "loss": 2.87, + "loss_": 0.8099, + "moe_loss": 0.1652, + "moe_loss_longrong": 1.5108, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 7e-06, + "loss": 2.931, + "loss_": 1.4784, + "moe_loss": 0.1621, + "moe_loss_longrong": 1.473, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 7.583333333333333e-06, + "loss": 2.7996, + "loss_": 1.0334, + "moe_loss": 0.1645, + "moe_loss_longrong": 1.5067, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 8.166666666666668e-06, + "loss": 2.8791, + "loss_": 1.4959, + "moe_loss": 0.1617, + "moe_loss_longrong": 1.4725, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 8.750000000000001e-06, + "loss": 2.8593, + "loss_": 1.3625, + "moe_loss": 0.1617, + "moe_loss_longrong": 1.4711, + "step": 105 + }, + { + "epoch": 0.01, + "learning_rate": 9.333333333333334e-06, + "loss": 2.8592, + "loss_": 1.0826, + "moe_loss": 0.1639, + "moe_loss_longrong": 1.5051, + "step": 112 + }, + { + "epoch": 0.01, + "learning_rate": 9.916666666666668e-06, + "loss": 2.8772, + "loss_": 1.2496, + "moe_loss": 0.1616, + "moe_loss_longrong": 1.4683, + "step": 119 + }, + { + "epoch": 0.02, + "learning_rate": 1.0500000000000001e-05, + "loss": 2.7839, + "loss_": 0.8619, + "moe_loss": 0.1636, + "moe_loss_longrong": 1.5017, + "step": 126 + }, + { + "epoch": 0.02, + "learning_rate": 1.1083333333333335e-05, + "loss": 2.845, + "loss_": 1.519, + "moe_loss": 0.1615, + "moe_loss_longrong": 1.4653, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 1.1666666666666668e-05, + "loss": 2.8779, + "loss_": 1.4328, + "moe_loss": 0.1615, + "moe_loss_longrong": 1.4632, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 1.2250000000000001e-05, + "loss": 2.8133, + "loss_": 1.7345, + "moe_loss": 0.1615, + "moe_loss_longrong": 1.4635, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 1.2833333333333335e-05, + "loss": 2.8421, + "loss_": 1.443, + "moe_loss": 0.1615, + "moe_loss_longrong": 1.4609, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 1.3416666666666666e-05, + "loss": 2.8433, + "loss_": 1.0833, + "moe_loss": 0.1613, + "moe_loss_longrong": 1.4601, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 1.4e-05, + "loss": 2.7887, + "loss_": 1.1754, + "moe_loss": 0.1611, + "moe_loss_longrong": 1.458, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 1.4583333333333333e-05, + "loss": 2.8346, + "loss_": 1.4786, + "moe_loss": 0.1615, + "moe_loss_longrong": 1.461, + "step": 175 + }, + { + "epoch": 0.02, + "learning_rate": 1.5166666666666667e-05, + "loss": 2.8158, + "loss_": 1.1078, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4563, + "step": 182 + }, + { + "epoch": 0.02, + "learning_rate": 1.575e-05, + "loss": 2.8165, + "loss_": 1.5185, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.4575, + "step": 189 + }, + { + "epoch": 0.02, + "learning_rate": 1.6333333333333335e-05, + "loss": 2.7353, + "loss_": 1.5306, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.4575, + "step": 196 + }, + { + "epoch": 0.03, + "learning_rate": 1.6916666666666667e-05, + "loss": 2.8177, + "loss_": 1.6701, + "moe_loss": 0.1613, + "moe_loss_longrong": 1.4552, + "step": 203 + }, + { + "epoch": 0.03, + "learning_rate": 1.7500000000000002e-05, + "loss": 2.8141, + "loss_": 1.147, + "moe_loss": 0.1611, + "moe_loss_longrong": 1.454, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 1.8083333333333334e-05, + "loss": 2.7925, + "loss_": 1.1863, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.454, + "step": 217 + }, + { + "epoch": 0.03, + "learning_rate": 1.866666666666667e-05, + "loss": 2.7435, + "loss_": 1.2765, + "moe_loss": 0.1611, + "moe_loss_longrong": 1.4537, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 1.925e-05, + "loss": 2.7391, + "loss_": 1.1006, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.4517, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 1.9833333333333335e-05, + "loss": 2.7548, + "loss_": 1.1628, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.4493, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 1.999997939064427e-05, + "loss": 2.8216, + "loss_": 1.3669, + "moe_loss": 0.1613, + "moe_loss_longrong": 1.4484, + "step": 245 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999881290305082e-05, + "loss": 2.7406, + "loss_": 0.9434, + "moe_loss": 0.1621, + "moe_loss_longrong": 1.4837, + "step": 252 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999702402277115e-05, + "loss": 2.7835, + "loss_": 1.6082, + "moe_loss": 0.1614, + "moe_loss_longrong": 1.4501, + "step": 259 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999442728005572e-05, + "loss": 2.783, + "loss_": 1.3989, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4474, + "step": 266 + }, + { + "epoch": 0.03, + "learning_rate": 1.999910226958833e-05, + "loss": 2.7628, + "loss_": 1.2652, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4481, + "step": 273 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998681029775905e-05, + "loss": 2.7709, + "loss_": 1.4145, + "moe_loss": 0.1609, + "moe_loss_longrong": 1.4454, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 1.999817901197144e-05, + "loss": 2.7808, + "loss_": 1.167, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.4457, + "step": 287 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997596220230666e-05, + "loss": 2.7761, + "loss_": 1.1866, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4467, + "step": 294 + }, + { + "epoch": 0.04, + "learning_rate": 1.999693265926188e-05, + "loss": 2.7605, + "loss_": 1.2065, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4448, + "step": 301 + }, + { + "epoch": 0.04, + "learning_rate": 1.99961883344259e-05, + "loss": 2.7849, + "loss_": 1.2751, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4447, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995363251736027e-05, + "loss": 2.7919, + "loss_": 1.1653, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4426, + "step": 315 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994457417857998e-05, + "loss": 2.7698, + "loss_": 1.4965, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4423, + "step": 322 + }, + { + "epoch": 0.04, + "learning_rate": 1.999347084010991e-05, + "loss": 2.7421, + "loss_": 0.8783, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4427, + "step": 329 + }, + { + "epoch": 0.04, + "learning_rate": 1.99924035264622e-05, + "loss": 2.7618, + "loss_": 1.2226, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4429, + "step": 336 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991255485537547e-05, + "loss": 2.76, + "loss_": 1.2206, + "moe_loss": 0.1609, + "moe_loss_longrong": 1.4425, + "step": 343 + }, + { + "epoch": 0.04, + "learning_rate": 1.999002672661082e-05, + "loss": 2.7533, + "loss_": 1.3051, + "moe_loss": 0.1609, + "moe_loss_longrong": 1.4398, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988717259609e-05, + "loss": 2.7161, + "loss_": 0.8665, + "moe_loss": 0.1615, + "moe_loss_longrong": 1.4731, + "step": 357 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987327095111085e-05, + "loss": 2.7577, + "loss_": 1.1175, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4404, + "step": 364 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985856244348034e-05, + "loss": 2.7281, + "loss_": 1.2506, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4399, + "step": 371 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984304719202647e-05, + "loss": 2.7585, + "loss_": 1.6201, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4402, + "step": 378 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982672532209487e-05, + "loss": 2.7048, + "loss_": 1.3787, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.44, + "step": 385 + }, + { + "epoch": 0.05, + "learning_rate": 1.998095969655477e-05, + "loss": 2.7554, + "loss_": 1.5028, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4382, + "step": 392 + }, + { + "epoch": 0.05, + "learning_rate": 1.997916622607627e-05, + "loss": 2.7604, + "loss_": 1.5144, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4369, + "step": 399 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977292135263187e-05, + "loss": 2.6773, + "loss_": 1.2568, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4391, + "step": 406 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975337439256046e-05, + "loss": 2.7524, + "loss_": 1.3851, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4373, + "step": 413 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973302153846577e-05, + "loss": 2.7138, + "loss_": 0.945, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4365, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971186295477575e-05, + "loss": 2.723, + "loss_": 1.2851, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4373, + "step": 427 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968989881242766e-05, + "loss": 2.7099, + "loss_": 1.4266, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4357, + "step": 434 + }, + { + "epoch": 0.06, + "learning_rate": 1.9966712928886697e-05, + "loss": 2.7253, + "loss_": 1.2214, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.436, + "step": 441 + }, + { + "epoch": 0.06, + "learning_rate": 1.996435545680454e-05, + "loss": 2.7258, + "loss_": 1.2096, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4344, + "step": 448 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961917484042012e-05, + "loss": 2.6884, + "loss_": 1.2, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4344, + "step": 455 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959399030295158e-05, + "loss": 2.685, + "loss_": 0.9126, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4363, + "step": 462 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956800115910216e-05, + "loss": 2.7146, + "loss_": 1.0302, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4344, + "step": 469 + }, + { + "epoch": 0.06, + "learning_rate": 1.995412076188348e-05, + "loss": 2.665, + "loss_": 1.4803, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4334, + "step": 476 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951360989861077e-05, + "loss": 2.7331, + "loss_": 1.1431, + "moe_loss": 0.1609, + "moe_loss_longrong": 1.4341, + "step": 483 + }, + { + "epoch": 0.06, + "learning_rate": 1.9948520822138837e-05, + "loss": 2.7357, + "loss_": 1.2042, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4322, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945600281662088e-05, + "loss": 2.7075, + "loss_": 0.97, + "moe_loss": 0.1614, + "moe_loss_longrong": 1.4623, + "step": 497 + }, + { + "epoch": 0.06, + "learning_rate": 1.9942599392025488e-05, + "loss": 2.7172, + "loss_": 1.2653, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4338, + "step": 504 + }, + { + "epoch": 0.06, + "learning_rate": 1.9939518177472813e-05, + "loss": 2.718, + "loss_": 1.1215, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4347, + "step": 511 + }, + { + "epoch": 0.06, + "learning_rate": 1.9936356662896777e-05, + "loss": 2.7166, + "loss_": 0.9601, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.432, + "step": 518 + }, + { + "epoch": 0.07, + "learning_rate": 1.9933114873838832e-05, + "loss": 2.711, + "loss_": 1.2031, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4313, + "step": 525 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929792836488954e-05, + "loss": 2.7297, + "loss_": 1.0668, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4315, + "step": 532 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926390577685434e-05, + "loss": 2.7135, + "loss_": 0.8892, + "moe_loss": 0.1616, + "moe_loss_longrong": 1.461, + "step": 539 + }, + { + "epoch": 0.07, + "learning_rate": 1.992290812491466e-05, + "loss": 2.676, + "loss_": 1.2917, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4313, + "step": 546 + }, + { + "epoch": 0.07, + "learning_rate": 1.9919345506310896e-05, + "loss": 2.6813, + "loss_": 1.4059, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4321, + "step": 553 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915702750656053e-05, + "loss": 2.7125, + "loss_": 1.085, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4315, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 1.991197988737947e-05, + "loss": 2.7119, + "loss_": 1.1287, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4303, + "step": 567 + }, + { + "epoch": 0.07, + "learning_rate": 1.9908176946557646e-05, + "loss": 2.6879, + "loss_": 1.1955, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4301, + "step": 574 + }, + { + "epoch": 0.07, + "learning_rate": 1.9904293958914032e-05, + "loss": 2.7081, + "loss_": 1.3866, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4304, + "step": 581 + }, + { + "epoch": 0.07, + "learning_rate": 1.990033095581876e-05, + "loss": 2.7152, + "loss_": 1.2814, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4308, + "step": 588 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896287969288396e-05, + "loss": 2.6944, + "loss_": 1.2075, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4292, + "step": 595 + }, + { + "epoch": 0.08, + "learning_rate": 1.989216503198568e-05, + "loss": 2.6422, + "loss_": 0.9899, + "moe_loss": 0.1617, + "moe_loss_longrong": 1.458, + "step": 602 + }, + { + "epoch": 0.08, + "learning_rate": 1.988796217721926e-05, + "loss": 2.7321, + "loss_": 1.437, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4304, + "step": 609 + }, + { + "epoch": 0.08, + "learning_rate": 1.9883679438943444e-05, + "loss": 2.6757, + "loss_": 1.3196, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4289, + "step": 616 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879316851757885e-05, + "loss": 2.688, + "loss_": 1.514, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.429, + "step": 623 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874874450907338e-05, + "loss": 2.7082, + "loss_": 1.1938, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4291, + "step": 630 + }, + { + "epoch": 0.08, + "learning_rate": 1.987035227228136e-05, + "loss": 2.7276, + "loss_": 1.3401, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.428, + "step": 637 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865750352414016e-05, + "loss": 2.685, + "loss_": 1.2597, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4279, + "step": 644 + }, + { + "epoch": 0.08, + "learning_rate": 1.9861068728483603e-05, + "loss": 2.7331, + "loss_": 1.3278, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4274, + "step": 651 + }, + { + "epoch": 0.08, + "learning_rate": 1.985630743831232e-05, + "loss": 2.7276, + "loss_": 1.542, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4281, + "step": 658 + }, + { + "epoch": 0.08, + "learning_rate": 1.985146652036599e-05, + "loss": 2.6914, + "loss_": 1.4054, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.427, + "step": 665 + }, + { + "epoch": 0.08, + "learning_rate": 1.984654601375373e-05, + "loss": 2.6531, + "loss_": 1.5136, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4262, + "step": 672 + }, + { + "epoch": 0.09, + "learning_rate": 1.9841545958227654e-05, + "loss": 2.7346, + "loss_": 1.1452, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.427, + "step": 679 + }, + { + "epoch": 0.09, + "learning_rate": 1.983646639418253e-05, + "loss": 2.707, + "loss_": 1.3454, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4264, + "step": 686 + }, + { + "epoch": 0.09, + "learning_rate": 1.9831307362655473e-05, + "loss": 2.6949, + "loss_": 1.316, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.427, + "step": 693 + }, + { + "epoch": 0.09, + "learning_rate": 1.9826068905325598e-05, + "loss": 2.6725, + "loss_": 0.8014, + "moe_loss": 0.1612, + "moe_loss_longrong": 1.4544, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 1.9820751064513693e-05, + "loss": 2.7006, + "loss_": 1.3368, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4252, + "step": 707 + }, + { + "epoch": 0.09, + "learning_rate": 1.981535388318188e-05, + "loss": 2.6809, + "loss_": 1.161, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.425, + "step": 714 + }, + { + "epoch": 0.09, + "learning_rate": 1.980987740493325e-05, + "loss": 2.6964, + "loss_": 1.1942, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4257, + "step": 721 + }, + { + "epoch": 0.09, + "learning_rate": 1.9804321674011533e-05, + "loss": 2.6673, + "loss_": 1.1932, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4267, + "step": 728 + }, + { + "epoch": 0.09, + "learning_rate": 1.979868673530073e-05, + "loss": 2.6938, + "loss_": 1.2555, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4248, + "step": 735 + }, + { + "epoch": 0.09, + "learning_rate": 1.9792972634324744e-05, + "loss": 2.7032, + "loss_": 1.1953, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4241, + "step": 742 + }, + { + "epoch": 0.09, + "learning_rate": 1.9787179417247032e-05, + "loss": 2.6754, + "loss_": 1.1357, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4249, + "step": 749 + }, + { + "epoch": 0.09, + "learning_rate": 1.9781307130870204e-05, + "loss": 2.6969, + "loss_": 1.3238, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4245, + "step": 756 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775355822635675e-05, + "loss": 2.6831, + "loss_": 1.2612, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4237, + "step": 763 + }, + { + "epoch": 0.1, + "learning_rate": 1.976932554062325e-05, + "loss": 2.6701, + "loss_": 1.1135, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4243, + "step": 770 + }, + { + "epoch": 0.1, + "learning_rate": 1.9763216333550768e-05, + "loss": 2.7003, + "loss_": 1.2469, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4236, + "step": 777 + }, + { + "epoch": 0.1, + "learning_rate": 1.9757028250773686e-05, + "loss": 2.6854, + "loss_": 1.3538, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4234, + "step": 784 + }, + { + "epoch": 0.1, + "learning_rate": 1.975076134228469e-05, + "loss": 2.6874, + "loss_": 1.226, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4239, + "step": 791 + }, + { + "epoch": 0.1, + "learning_rate": 1.9744415658713282e-05, + "loss": 2.7152, + "loss_": 1.2979, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4231, + "step": 798 + }, + { + "epoch": 0.1, + "learning_rate": 1.9737991251325384e-05, + "loss": 2.6908, + "loss_": 1.0737, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4238, + "step": 805 + }, + { + "epoch": 0.1, + "learning_rate": 1.9731488172022915e-05, + "loss": 2.7375, + "loss_": 1.416, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4219, + "step": 812 + }, + { + "epoch": 0.1, + "learning_rate": 1.972490647334337e-05, + "loss": 2.6467, + "loss_": 1.0544, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4236, + "step": 819 + }, + { + "epoch": 0.1, + "learning_rate": 1.971824620845941e-05, + "loss": 2.6613, + "loss_": 1.1665, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4229, + "step": 826 + }, + { + "epoch": 0.1, + "learning_rate": 1.9711507431178403e-05, + "loss": 2.654, + "loss_": 1.2871, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4239, + "step": 833 + }, + { + "epoch": 0.11, + "learning_rate": 1.9704690195942035e-05, + "loss": 2.6831, + "loss_": 1.4114, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4235, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 1.9697794557825812e-05, + "loss": 2.7215, + "loss_": 1.367, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4221, + "step": 847 + }, + { + "epoch": 0.11, + "learning_rate": 1.969082057253867e-05, + "loss": 2.6998, + "loss_": 1.0197, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.422, + "step": 854 + }, + { + "epoch": 0.11, + "learning_rate": 1.9683768296422495e-05, + "loss": 2.6869, + "loss_": 1.2449, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4224, + "step": 861 + }, + { + "epoch": 0.11, + "learning_rate": 1.9676637786451665e-05, + "loss": 2.7047, + "loss_": 1.2273, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.422, + "step": 868 + }, + { + "epoch": 0.11, + "learning_rate": 1.966942910023261e-05, + "loss": 2.6873, + "loss_": 0.9599, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4206, + "step": 875 + }, + { + "epoch": 0.11, + "learning_rate": 1.9662142296003335e-05, + "loss": 2.6721, + "loss_": 1.2456, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4221, + "step": 882 + }, + { + "epoch": 0.11, + "learning_rate": 1.965477743263294e-05, + "loss": 2.6481, + "loss_": 1.4271, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4222, + "step": 889 + }, + { + "epoch": 0.11, + "learning_rate": 1.964733456962116e-05, + "loss": 2.6621, + "loss_": 1.1236, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.422, + "step": 896 + }, + { + "epoch": 0.11, + "learning_rate": 1.9639813767097886e-05, + "loss": 2.66, + "loss_": 1.2049, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4193, + "step": 903 + }, + { + "epoch": 0.11, + "learning_rate": 1.9632215085822658e-05, + "loss": 2.7064, + "loss_": 1.2497, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4214, + "step": 910 + }, + { + "epoch": 0.11, + "learning_rate": 1.9624538587184197e-05, + "loss": 2.6533, + "loss_": 1.22, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4211, + "step": 917 + }, + { + "epoch": 0.12, + "learning_rate": 1.9616784333199896e-05, + "loss": 2.644, + "loss_": 1.1443, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4217, + "step": 924 + }, + { + "epoch": 0.12, + "learning_rate": 1.9608952386515327e-05, + "loss": 2.6987, + "loss_": 1.1736, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4197, + "step": 931 + }, + { + "epoch": 0.12, + "learning_rate": 1.9601042810403725e-05, + "loss": 2.6732, + "loss_": 1.1886, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4196, + "step": 938 + }, + { + "epoch": 0.12, + "learning_rate": 1.959305566876549e-05, + "loss": 2.6806, + "loss_": 1.0944, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4205, + "step": 945 + }, + { + "epoch": 0.12, + "learning_rate": 1.9584991026127655e-05, + "loss": 2.6919, + "loss_": 1.4477, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4204, + "step": 952 + }, + { + "epoch": 0.12, + "learning_rate": 1.957684894764338e-05, + "loss": 2.6751, + "loss_": 1.1916, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4204, + "step": 959 + }, + { + "epoch": 0.12, + "learning_rate": 1.9568629499091413e-05, + "loss": 2.6459, + "loss_": 1.2407, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4207, + "step": 966 + }, + { + "epoch": 0.12, + "learning_rate": 1.9560332746875574e-05, + "loss": 2.6572, + "loss_": 0.8698, + "moe_loss": 0.1613, + "moe_loss_longrong": 1.4436, + "step": 973 + }, + { + "epoch": 0.12, + "learning_rate": 1.9551958758024194e-05, + "loss": 2.6679, + "loss_": 1.3397, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4202, + "step": 980 + }, + { + "epoch": 0.12, + "learning_rate": 1.9543507600189606e-05, + "loss": 2.6673, + "loss_": 1.164, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4188, + "step": 987 + }, + { + "epoch": 0.12, + "learning_rate": 1.9534979341647562e-05, + "loss": 2.6295, + "loss_": 1.3512, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4192, + "step": 994 + }, + { + "epoch": 0.13, + "learning_rate": 1.9526374051296714e-05, + "loss": 2.645, + "loss_": 1.1948, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4185, + "step": 1001 + }, + { + "epoch": 0.13, + "learning_rate": 1.9517691798658042e-05, + "loss": 2.7004, + "loss_": 1.19, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4188, + "step": 1008 + }, + { + "epoch": 0.13, + "learning_rate": 1.9508932653874283e-05, + "loss": 2.6404, + "loss_": 1.4758, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.418, + "step": 1015 + }, + { + "epoch": 0.13, + "learning_rate": 1.9500096687709393e-05, + "loss": 2.6529, + "loss_": 1.1355, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4179, + "step": 1022 + }, + { + "epoch": 0.13, + "learning_rate": 1.9491183971547943e-05, + "loss": 2.6448, + "loss_": 1.3669, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4182, + "step": 1029 + }, + { + "epoch": 0.13, + "learning_rate": 1.948219457739456e-05, + "loss": 2.674, + "loss_": 1.5143, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4184, + "step": 1036 + }, + { + "epoch": 0.13, + "learning_rate": 1.9473128577873346e-05, + "loss": 2.6813, + "loss_": 1.3613, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4184, + "step": 1043 + }, + { + "epoch": 0.13, + "learning_rate": 1.9463986046227284e-05, + "loss": 2.6566, + "loss_": 1.2685, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4179, + "step": 1050 + }, + { + "epoch": 0.13, + "learning_rate": 1.9454767056317654e-05, + "loss": 2.6556, + "loss_": 1.1164, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4171, + "step": 1057 + }, + { + "epoch": 0.13, + "learning_rate": 1.9445471682623425e-05, + "loss": 2.6723, + "loss_": 1.3762, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4178, + "step": 1064 + }, + { + "epoch": 0.13, + "learning_rate": 1.9436100000240668e-05, + "loss": 2.6654, + "loss_": 1.3065, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4181, + "step": 1071 + }, + { + "epoch": 0.14, + "learning_rate": 1.9426652084881934e-05, + "loss": 2.6471, + "loss_": 0.7216, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4175, + "step": 1078 + }, + { + "epoch": 0.14, + "learning_rate": 1.9417128012875657e-05, + "loss": 2.6433, + "loss_": 1.311, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4172, + "step": 1085 + }, + { + "epoch": 0.14, + "learning_rate": 1.9407527861165523e-05, + "loss": 2.6788, + "loss_": 1.4472, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4174, + "step": 1092 + }, + { + "epoch": 0.14, + "learning_rate": 1.9397851707309864e-05, + "loss": 2.6715, + "loss_": 1.2477, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4167, + "step": 1099 + }, + { + "epoch": 0.14, + "learning_rate": 1.9388099629481017e-05, + "loss": 2.6497, + "loss_": 1.2279, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4171, + "step": 1106 + }, + { + "epoch": 0.14, + "learning_rate": 1.93782717064647e-05, + "loss": 2.6772, + "loss_": 1.0676, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4161, + "step": 1113 + }, + { + "epoch": 0.14, + "learning_rate": 1.9368368017659368e-05, + "loss": 2.6543, + "loss_": 1.3057, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4164, + "step": 1120 + }, + { + "epoch": 0.14, + "learning_rate": 1.9358388643075597e-05, + "loss": 2.6439, + "loss_": 1.2984, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4155, + "step": 1127 + }, + { + "epoch": 0.14, + "learning_rate": 1.9348333663335393e-05, + "loss": 2.6489, + "loss_": 1.1934, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.415, + "step": 1134 + }, + { + "epoch": 0.14, + "learning_rate": 1.9338203159671584e-05, + "loss": 2.6834, + "loss_": 1.2899, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4166, + "step": 1141 + }, + { + "epoch": 0.14, + "learning_rate": 1.9327997213927136e-05, + "loss": 2.6676, + "loss_": 1.3016, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4161, + "step": 1148 + }, + { + "epoch": 0.14, + "learning_rate": 1.931771590855451e-05, + "loss": 2.6612, + "loss_": 1.0128, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4157, + "step": 1155 + }, + { + "epoch": 0.15, + "learning_rate": 1.9307359326614975e-05, + "loss": 2.6457, + "loss_": 1.1214, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4161, + "step": 1162 + }, + { + "epoch": 0.15, + "learning_rate": 1.929692755177796e-05, + "loss": 2.6583, + "loss_": 1.2741, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4164, + "step": 1169 + }, + { + "epoch": 0.15, + "learning_rate": 1.9286420668320356e-05, + "loss": 2.6487, + "loss_": 0.9804, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.416, + "step": 1176 + }, + { + "epoch": 0.15, + "learning_rate": 1.9275838761125866e-05, + "loss": 2.6338, + "loss_": 1.1369, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4158, + "step": 1183 + }, + { + "epoch": 0.15, + "learning_rate": 1.926518191568428e-05, + "loss": 2.6547, + "loss_": 1.2162, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4163, + "step": 1190 + }, + { + "epoch": 0.15, + "learning_rate": 1.9254450218090814e-05, + "loss": 2.6478, + "loss_": 1.1011, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.415, + "step": 1197 + }, + { + "epoch": 0.15, + "learning_rate": 1.92436437550454e-05, + "loss": 2.6527, + "loss_": 1.2542, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4151, + "step": 1204 + }, + { + "epoch": 0.15, + "learning_rate": 1.9232762613851993e-05, + "loss": 2.6584, + "loss_": 1.269, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.414, + "step": 1211 + }, + { + "epoch": 0.15, + "learning_rate": 1.922180688241786e-05, + "loss": 2.6481, + "loss_": 1.1536, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4155, + "step": 1218 + }, + { + "epoch": 0.15, + "learning_rate": 1.9210776649252875e-05, + "loss": 2.6695, + "loss_": 1.318, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4146, + "step": 1225 + }, + { + "epoch": 0.15, + "learning_rate": 1.9199672003468795e-05, + "loss": 2.6144, + "loss_": 1.1917, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4145, + "step": 1232 + }, + { + "epoch": 0.16, + "learning_rate": 1.918849303477856e-05, + "loss": 2.6512, + "loss_": 1.3762, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4143, + "step": 1239 + }, + { + "epoch": 0.16, + "learning_rate": 1.9177239833495545e-05, + "loss": 2.6538, + "loss_": 1.3326, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4144, + "step": 1246 + }, + { + "epoch": 0.16, + "learning_rate": 1.9165912490532838e-05, + "loss": 2.6337, + "loss_": 1.3393, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4141, + "step": 1253 + }, + { + "epoch": 0.16, + "learning_rate": 1.9154511097402512e-05, + "loss": 2.6493, + "loss_": 1.3026, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4143, + "step": 1260 + }, + { + "epoch": 0.16, + "learning_rate": 1.9143035746214883e-05, + "loss": 2.6833, + "loss_": 1.2821, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4147, + "step": 1267 + }, + { + "epoch": 0.16, + "learning_rate": 1.9131486529677755e-05, + "loss": 2.6348, + "loss_": 1.2194, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4144, + "step": 1274 + }, + { + "epoch": 0.16, + "learning_rate": 1.9119863541095697e-05, + "loss": 2.622, + "loss_": 1.4341, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4139, + "step": 1281 + }, + { + "epoch": 0.16, + "learning_rate": 1.9108166874369253e-05, + "loss": 2.6579, + "loss_": 1.1947, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4141, + "step": 1288 + }, + { + "epoch": 0.16, + "learning_rate": 1.9096396623994215e-05, + "loss": 2.6413, + "loss_": 1.3734, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4131, + "step": 1295 + }, + { + "epoch": 0.16, + "learning_rate": 1.9084552885060846e-05, + "loss": 2.6371, + "loss_": 1.2291, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.413, + "step": 1302 + }, + { + "epoch": 0.16, + "learning_rate": 1.9072635753253112e-05, + "loss": 2.6483, + "loss_": 1.1361, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4127, + "step": 1309 + }, + { + "epoch": 0.16, + "learning_rate": 1.9060645324847904e-05, + "loss": 2.6325, + "loss_": 1.3775, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4128, + "step": 1316 + }, + { + "epoch": 0.17, + "learning_rate": 1.9048581696714276e-05, + "loss": 2.6272, + "loss_": 1.2366, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4132, + "step": 1323 + }, + { + "epoch": 0.17, + "learning_rate": 1.9036444966312652e-05, + "loss": 2.6566, + "loss_": 1.2485, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4132, + "step": 1330 + }, + { + "epoch": 0.17, + "learning_rate": 1.9024235231694024e-05, + "loss": 2.6189, + "loss_": 1.3857, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4133, + "step": 1337 + }, + { + "epoch": 0.17, + "learning_rate": 1.90119525914992e-05, + "loss": 2.6107, + "loss_": 1.0882, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4129, + "step": 1344 + }, + { + "epoch": 0.17, + "learning_rate": 1.899959714495796e-05, + "loss": 2.6564, + "loss_": 1.0952, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4127, + "step": 1351 + }, + { + "epoch": 0.17, + "learning_rate": 1.8987168991888293e-05, + "loss": 2.648, + "loss_": 0.9829, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4129, + "step": 1358 + }, + { + "epoch": 0.17, + "learning_rate": 1.8974668232695562e-05, + "loss": 2.6334, + "loss_": 1.1611, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4121, + "step": 1365 + }, + { + "epoch": 0.17, + "learning_rate": 1.896209496837171e-05, + "loss": 2.6435, + "loss_": 1.351, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4122, + "step": 1372 + }, + { + "epoch": 0.17, + "learning_rate": 1.8949449300494444e-05, + "loss": 2.6572, + "loss_": 1.2158, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4125, + "step": 1379 + }, + { + "epoch": 0.17, + "learning_rate": 1.8936731331226402e-05, + "loss": 2.6249, + "loss_": 1.2495, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4127, + "step": 1386 + }, + { + "epoch": 0.17, + "learning_rate": 1.892394116331434e-05, + "loss": 2.6299, + "loss_": 1.0987, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4117, + "step": 1393 + }, + { + "epoch": 0.18, + "learning_rate": 1.8911078900088295e-05, + "loss": 2.6377, + "loss_": 1.179, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4116, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 1.8898144645460744e-05, + "loss": 2.6133, + "loss_": 1.1341, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.412, + "step": 1407 + }, + { + "epoch": 0.18, + "learning_rate": 1.8885138503925793e-05, + "loss": 2.6514, + "loss_": 1.1486, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4115, + "step": 1414 + }, + { + "epoch": 0.18, + "learning_rate": 1.8872060580558295e-05, + "loss": 2.6529, + "loss_": 1.2706, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4111, + "step": 1421 + }, + { + "epoch": 0.18, + "learning_rate": 1.8858910981013025e-05, + "loss": 2.6298, + "loss_": 1.2814, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4114, + "step": 1428 + }, + { + "epoch": 0.18, + "learning_rate": 1.884568981152382e-05, + "loss": 2.6107, + "loss_": 0.8031, + "moe_loss": 0.161, + "moe_loss_longrong": 1.433, + "step": 1435 + }, + { + "epoch": 0.18, + "learning_rate": 1.883239717890272e-05, + "loss": 2.6321, + "loss_": 1.3342, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4109, + "step": 1442 + }, + { + "epoch": 0.18, + "learning_rate": 1.881903319053911e-05, + "loss": 2.6271, + "loss_": 1.224, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.412, + "step": 1449 + }, + { + "epoch": 0.18, + "learning_rate": 1.880559795439884e-05, + "loss": 2.6168, + "loss_": 1.0488, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4113, + "step": 1456 + }, + { + "epoch": 0.18, + "learning_rate": 1.8792091579023365e-05, + "loss": 2.6358, + "loss_": 1.0113, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.411, + "step": 1463 + }, + { + "epoch": 0.18, + "learning_rate": 1.8778514173528873e-05, + "loss": 2.6396, + "loss_": 1.1213, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4112, + "step": 1470 + }, + { + "epoch": 0.19, + "learning_rate": 1.8764865847605384e-05, + "loss": 2.6268, + "loss_": 1.0843, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4125, + "step": 1477 + }, + { + "epoch": 0.19, + "learning_rate": 1.875114671151587e-05, + "loss": 2.6604, + "loss_": 1.1708, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4112, + "step": 1484 + }, + { + "epoch": 0.19, + "learning_rate": 1.8737356876095387e-05, + "loss": 2.6187, + "loss_": 1.0976, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4107, + "step": 1491 + }, + { + "epoch": 0.19, + "learning_rate": 1.8723496452750146e-05, + "loss": 2.6198, + "loss_": 1.1692, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4107, + "step": 1498 + }, + { + "epoch": 0.19, + "learning_rate": 1.8709565553456632e-05, + "loss": 2.621, + "loss_": 1.3206, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4102, + "step": 1505 + }, + { + "epoch": 0.19, + "learning_rate": 1.86955642907607e-05, + "loss": 2.6184, + "loss_": 1.1808, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4106, + "step": 1512 + }, + { + "epoch": 0.19, + "learning_rate": 1.8681492777776656e-05, + "loss": 2.6577, + "loss_": 1.1146, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4105, + "step": 1519 + }, + { + "epoch": 0.19, + "learning_rate": 1.8667351128186347e-05, + "loss": 2.6417, + "loss_": 1.3074, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4109, + "step": 1526 + }, + { + "epoch": 0.19, + "learning_rate": 1.8653139456238257e-05, + "loss": 2.6165, + "loss_": 1.0176, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4104, + "step": 1533 + }, + { + "epoch": 0.19, + "learning_rate": 1.8638857876746556e-05, + "loss": 2.6841, + "loss_": 1.3367, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4101, + "step": 1540 + }, + { + "epoch": 0.19, + "learning_rate": 1.8624506505090192e-05, + "loss": 2.6275, + "loss_": 1.1427, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4107, + "step": 1547 + }, + { + "epoch": 0.19, + "learning_rate": 1.8610085457211958e-05, + "loss": 2.6526, + "loss_": 1.3939, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4104, + "step": 1554 + }, + { + "epoch": 0.2, + "learning_rate": 1.8595594849617552e-05, + "loss": 2.6202, + "loss_": 1.1005, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4099, + "step": 1561 + }, + { + "epoch": 0.2, + "learning_rate": 1.8581034799374632e-05, + "loss": 2.6634, + "loss_": 1.2608, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4098, + "step": 1568 + }, + { + "epoch": 0.2, + "learning_rate": 1.8566405424111873e-05, + "loss": 2.6483, + "loss_": 1.2839, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4097, + "step": 1575 + }, + { + "epoch": 0.2, + "learning_rate": 1.855170684201802e-05, + "loss": 2.6077, + "loss_": 1.3359, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.41, + "step": 1582 + }, + { + "epoch": 0.2, + "learning_rate": 1.8536939171840934e-05, + "loss": 2.6574, + "loss_": 1.3449, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4099, + "step": 1589 + }, + { + "epoch": 0.2, + "learning_rate": 1.8522102532886627e-05, + "loss": 2.6374, + "loss_": 1.0245, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4102, + "step": 1596 + }, + { + "epoch": 0.2, + "learning_rate": 1.8507197045018286e-05, + "loss": 2.6555, + "loss_": 1.2334, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4094, + "step": 1603 + }, + { + "epoch": 0.2, + "learning_rate": 1.8492222828655347e-05, + "loss": 2.6118, + "loss_": 1.294, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4088, + "step": 1610 + }, + { + "epoch": 0.2, + "learning_rate": 1.8477180004772473e-05, + "loss": 2.6092, + "loss_": 1.1013, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4087, + "step": 1617 + }, + { + "epoch": 0.2, + "learning_rate": 1.8462068694898603e-05, + "loss": 2.6415, + "loss_": 1.1863, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4088, + "step": 1624 + }, + { + "epoch": 0.2, + "learning_rate": 1.8446889021115967e-05, + "loss": 2.6141, + "loss_": 1.2587, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4091, + "step": 1631 + }, + { + "epoch": 0.21, + "learning_rate": 1.84316411060591e-05, + "loss": 2.6031, + "loss_": 0.798, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4271, + "step": 1638 + }, + { + "epoch": 0.21, + "learning_rate": 1.841632507291384e-05, + "loss": 2.6305, + "loss_": 0.9585, + "moe_loss": 0.161, + "moe_loss_longrong": 1.427, + "step": 1645 + }, + { + "epoch": 0.21, + "learning_rate": 1.8400941045416352e-05, + "loss": 2.5888, + "loss_": 1.2668, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4088, + "step": 1652 + }, + { + "epoch": 0.21, + "learning_rate": 1.8385489147852117e-05, + "loss": 2.6253, + "loss_": 1.0907, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4084, + "step": 1659 + }, + { + "epoch": 0.21, + "learning_rate": 1.8369969505054915e-05, + "loss": 2.6541, + "loss_": 1.1209, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4083, + "step": 1666 + }, + { + "epoch": 0.21, + "learning_rate": 1.8354382242405853e-05, + "loss": 2.6553, + "loss_": 1.1877, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4086, + "step": 1673 + }, + { + "epoch": 0.21, + "learning_rate": 1.8338727485832317e-05, + "loss": 2.6105, + "loss_": 1.0542, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4087, + "step": 1680 + }, + { + "epoch": 0.21, + "learning_rate": 1.832300536180696e-05, + "loss": 2.6209, + "loss_": 1.3141, + "moe_loss": 0.1608, + "moe_loss_longrong": 1.4275, + "step": 1687 + }, + { + "epoch": 0.21, + "learning_rate": 1.8307215997346703e-05, + "loss": 2.6477, + "loss_": 1.3156, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4087, + "step": 1694 + }, + { + "epoch": 0.21, + "learning_rate": 1.8291359520011687e-05, + "loss": 2.6633, + "loss_": 1.2031, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4075, + "step": 1701 + }, + { + "epoch": 0.21, + "learning_rate": 1.8275436057904246e-05, + "loss": 2.6259, + "loss_": 1.1971, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4082, + "step": 1708 + }, + { + "epoch": 0.21, + "learning_rate": 1.825944573966788e-05, + "loss": 2.6185, + "loss_": 1.098, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4073, + "step": 1715 + }, + { + "epoch": 0.22, + "learning_rate": 1.82433886944862e-05, + "loss": 2.614, + "loss_": 1.0326, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4079, + "step": 1722 + }, + { + "epoch": 0.22, + "learning_rate": 1.8227265052081913e-05, + "loss": 2.6257, + "loss_": 1.3002, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4078, + "step": 1729 + }, + { + "epoch": 0.22, + "learning_rate": 1.821107494271574e-05, + "loss": 2.6101, + "loss_": 1.3726, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4089, + "step": 1736 + }, + { + "epoch": 0.22, + "learning_rate": 1.8194818497185385e-05, + "loss": 2.6377, + "loss_": 1.2734, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4081, + "step": 1743 + }, + { + "epoch": 0.22, + "learning_rate": 1.8178495846824474e-05, + "loss": 2.6187, + "loss_": 1.062, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4082, + "step": 1750 + }, + { + "epoch": 0.22, + "learning_rate": 1.81621071235015e-05, + "loss": 2.6136, + "loss_": 1.3893, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4079, + "step": 1757 + }, + { + "epoch": 0.22, + "learning_rate": 1.814565245961873e-05, + "loss": 2.6332, + "loss_": 1.3444, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4075, + "step": 1764 + }, + { + "epoch": 0.22, + "learning_rate": 1.8129131988111174e-05, + "loss": 2.6251, + "loss_": 1.0967, + "moe_loss": 0.161, + "moe_loss_longrong": 1.4258, + "step": 1771 + }, + { + "epoch": 0.22, + "learning_rate": 1.8112545842445488e-05, + "loss": 2.6364, + "loss_": 1.1797, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4066, + "step": 1778 + }, + { + "epoch": 0.22, + "learning_rate": 1.80958941566189e-05, + "loss": 2.618, + "loss_": 1.2443, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4075, + "step": 1785 + }, + { + "epoch": 0.22, + "learning_rate": 1.807917706515813e-05, + "loss": 2.5878, + "loss_": 1.4678, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4082, + "step": 1792 + }, + { + "epoch": 0.23, + "learning_rate": 1.8062394703118294e-05, + "loss": 2.6224, + "loss_": 1.1059, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4077, + "step": 1799 + }, + { + "epoch": 0.23, + "learning_rate": 1.804554720608183e-05, + "loss": 2.6185, + "loss_": 1.2769, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4077, + "step": 1806 + }, + { + "epoch": 0.23, + "learning_rate": 1.8028634710157392e-05, + "loss": 2.5904, + "loss_": 1.1422, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4071, + "step": 1813 + }, + { + "epoch": 0.23, + "learning_rate": 1.801165735197874e-05, + "loss": 2.5663, + "loss_": 0.9287, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4073, + "step": 1820 + }, + { + "epoch": 0.23, + "learning_rate": 1.7994615268703655e-05, + "loss": 2.6135, + "loss_": 1.2268, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4064, + "step": 1827 + }, + { + "epoch": 0.23, + "learning_rate": 1.7977508598012834e-05, + "loss": 2.5989, + "loss_": 1.1974, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4069, + "step": 1834 + }, + { + "epoch": 0.23, + "learning_rate": 1.7960337478108743e-05, + "loss": 2.5877, + "loss_": 1.13, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4064, + "step": 1841 + }, + { + "epoch": 0.23, + "learning_rate": 1.7943102047714548e-05, + "loss": 2.5955, + "loss_": 0.9327, + "moe_loss": 0.1609, + "moe_loss_longrong": 1.4235, + "step": 1848 + }, + { + "epoch": 0.23, + "learning_rate": 1.7925802446072957e-05, + "loss": 2.596, + "loss_": 1.2759, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4068, + "step": 1855 + }, + { + "epoch": 0.23, + "learning_rate": 1.7908438812945106e-05, + "loss": 2.6038, + "loss_": 1.1389, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4064, + "step": 1862 + }, + { + "epoch": 0.23, + "learning_rate": 1.7891011288609454e-05, + "loss": 2.585, + "loss_": 1.3728, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4061, + "step": 1869 + }, + { + "epoch": 0.24, + "learning_rate": 1.7873520013860595e-05, + "loss": 2.6263, + "loss_": 1.3243, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4064, + "step": 1876 + }, + { + "epoch": 0.24, + "learning_rate": 1.7855965130008188e-05, + "loss": 2.6254, + "loss_": 1.2195, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4066, + "step": 1883 + }, + { + "epoch": 0.24, + "learning_rate": 1.783834677887576e-05, + "loss": 2.6312, + "loss_": 0.8851, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4064, + "step": 1890 + }, + { + "epoch": 0.24, + "learning_rate": 1.782066510279959e-05, + "loss": 2.6333, + "loss_": 1.3439, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4063, + "step": 1897 + }, + { + "epoch": 0.24, + "learning_rate": 1.7802920244627543e-05, + "loss": 2.6112, + "loss_": 1.1944, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4063, + "step": 1904 + }, + { + "epoch": 0.24, + "learning_rate": 1.778511234771793e-05, + "loss": 2.614, + "loss_": 1.3766, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4055, + "step": 1911 + }, + { + "epoch": 0.24, + "learning_rate": 1.776724155593835e-05, + "loss": 2.5922, + "loss_": 1.1792, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4058, + "step": 1918 + }, + { + "epoch": 0.24, + "learning_rate": 1.7749308013664503e-05, + "loss": 2.6604, + "loss_": 1.1936, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4058, + "step": 1925 + }, + { + "epoch": 0.24, + "learning_rate": 1.7731311865779058e-05, + "loss": 2.6211, + "loss_": 1.235, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4057, + "step": 1932 + }, + { + "epoch": 0.24, + "learning_rate": 1.771325325767046e-05, + "loss": 2.6152, + "loss_": 1.1696, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4055, + "step": 1939 + }, + { + "epoch": 0.24, + "learning_rate": 1.7695132335231758e-05, + "loss": 2.6476, + "loss_": 1.2283, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4054, + "step": 1946 + }, + { + "epoch": 0.24, + "learning_rate": 1.7676949244859435e-05, + "loss": 2.6351, + "loss_": 1.0637, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4051, + "step": 1953 + }, + { + "epoch": 0.25, + "learning_rate": 1.7658704133452228e-05, + "loss": 2.6196, + "loss_": 1.3258, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4058, + "step": 1960 + }, + { + "epoch": 0.25, + "learning_rate": 1.764039714840991e-05, + "loss": 2.5882, + "loss_": 0.9733, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4048, + "step": 1967 + }, + { + "epoch": 0.25, + "learning_rate": 1.7622028437632154e-05, + "loss": 2.6128, + "loss_": 1.2358, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4052, + "step": 1974 + }, + { + "epoch": 0.25, + "learning_rate": 1.7603598149517277e-05, + "loss": 2.6192, + "loss_": 0.9268, + "moe_loss": 0.1607, + "moe_loss_longrong": 1.4196, + "step": 1981 + }, + { + "epoch": 0.25, + "learning_rate": 1.7585106432961093e-05, + "loss": 2.593, + "loss_": 1.0061, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4049, + "step": 1988 + }, + { + "epoch": 0.25, + "learning_rate": 1.7566553437355674e-05, + "loss": 2.6141, + "loss_": 1.083, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4046, + "step": 1995 + }, + { + "epoch": 0.25, + "learning_rate": 1.754793931258817e-05, + "loss": 2.6423, + "loss_": 1.0028, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4047, + "step": 2002 + }, + { + "epoch": 0.25, + "learning_rate": 1.7529264209039573e-05, + "loss": 2.5863, + "loss_": 1.4222, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4047, + "step": 2009 + }, + { + "epoch": 0.25, + "learning_rate": 1.751052827758352e-05, + "loss": 2.6299, + "loss_": 0.9747, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4046, + "step": 2016 + }, + { + "epoch": 0.25, + "learning_rate": 1.7491731669585066e-05, + "loss": 2.6117, + "loss_": 1.1316, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4043, + "step": 2023 + }, + { + "epoch": 0.25, + "learning_rate": 1.747287453689947e-05, + "loss": 2.6125, + "loss_": 0.9258, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.405, + "step": 2030 + }, + { + "epoch": 0.26, + "learning_rate": 1.745395703187095e-05, + "loss": 2.6568, + "loss_": 1.2147, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4046, + "step": 2037 + }, + { + "epoch": 0.26, + "learning_rate": 1.7434979307331482e-05, + "loss": 2.6449, + "loss_": 1.033, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4042, + "step": 2044 + }, + { + "epoch": 0.26, + "learning_rate": 1.7415941516599525e-05, + "loss": 2.6137, + "loss_": 1.2328, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4039, + "step": 2051 + }, + { + "epoch": 0.26, + "learning_rate": 1.7396843813478825e-05, + "loss": 2.6196, + "loss_": 1.1898, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.404, + "step": 2058 + }, + { + "epoch": 0.26, + "learning_rate": 1.7377686352257136e-05, + "loss": 2.6021, + "loss_": 1.2029, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4045, + "step": 2065 + }, + { + "epoch": 0.26, + "learning_rate": 1.7358469287705e-05, + "loss": 2.6354, + "loss_": 1.368, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4045, + "step": 2072 + }, + { + "epoch": 0.26, + "learning_rate": 1.7339192775074486e-05, + "loss": 2.619, + "loss_": 1.4305, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4048, + "step": 2079 + }, + { + "epoch": 0.26, + "learning_rate": 1.7319856970097927e-05, + "loss": 2.6185, + "loss_": 1.19, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4043, + "step": 2086 + }, + { + "epoch": 0.26, + "learning_rate": 1.730046202898668e-05, + "loss": 2.589, + "loss_": 1.1368, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4042, + "step": 2093 + }, + { + "epoch": 0.26, + "learning_rate": 1.7281008108429854e-05, + "loss": 2.6104, + "loss_": 1.02, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4044, + "step": 2100 + }, + { + "epoch": 0.26, + "learning_rate": 1.726149536559304e-05, + "loss": 2.6138, + "loss_": 1.2443, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.404, + "step": 2107 + }, + { + "epoch": 0.26, + "learning_rate": 1.7241923958117047e-05, + "loss": 2.6079, + "loss_": 1.272, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4039, + "step": 2114 + }, + { + "epoch": 0.27, + "learning_rate": 1.7222294044116637e-05, + "loss": 2.6155, + "loss_": 1.2334, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4037, + "step": 2121 + }, + { + "epoch": 0.27, + "learning_rate": 1.7202605782179223e-05, + "loss": 2.6217, + "loss_": 1.1778, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4039, + "step": 2128 + }, + { + "epoch": 0.27, + "learning_rate": 1.718285933136361e-05, + "loss": 2.6156, + "loss_": 1.0468, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4037, + "step": 2135 + }, + { + "epoch": 0.27, + "learning_rate": 1.7163054851198712e-05, + "loss": 2.6145, + "loss_": 1.2375, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4037, + "step": 2142 + }, + { + "epoch": 0.27, + "learning_rate": 1.7143192501682243e-05, + "loss": 2.6167, + "loss_": 1.325, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4036, + "step": 2149 + }, + { + "epoch": 0.27, + "learning_rate": 1.712327244327944e-05, + "loss": 2.5924, + "loss_": 0.955, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4035, + "step": 2156 + }, + { + "epoch": 0.27, + "learning_rate": 1.7103294836921752e-05, + "loss": 2.6235, + "loss_": 1.0911, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4034, + "step": 2163 + }, + { + "epoch": 0.27, + "learning_rate": 1.708325984400557e-05, + "loss": 2.6047, + "loss_": 0.8419, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4037, + "step": 2170 + }, + { + "epoch": 0.27, + "learning_rate": 1.7063167626390893e-05, + "loss": 2.6268, + "loss_": 1.1833, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4029, + "step": 2177 + }, + { + "epoch": 0.27, + "learning_rate": 1.7043018346400024e-05, + "loss": 2.622, + "loss_": 1.0641, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4028, + "step": 2184 + }, + { + "epoch": 0.27, + "learning_rate": 1.7022812166816277e-05, + "loss": 2.6011, + "loss_": 0.9805, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4028, + "step": 2191 + }, + { + "epoch": 0.28, + "learning_rate": 1.7002549250882637e-05, + "loss": 2.5584, + "loss_": 1.1622, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4025, + "step": 2198 + }, + { + "epoch": 0.28, + "learning_rate": 1.698222976230047e-05, + "loss": 2.607, + "loss_": 1.2586, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4026, + "step": 2205 + }, + { + "epoch": 0.28, + "learning_rate": 1.6961853865228176e-05, + "loss": 2.6328, + "loss_": 1.3466, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4029, + "step": 2212 + }, + { + "epoch": 0.28, + "learning_rate": 1.6941421724279866e-05, + "loss": 2.568, + "loss_": 1.2851, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4028, + "step": 2219 + }, + { + "epoch": 0.28, + "learning_rate": 1.6920933504524048e-05, + "loss": 2.5682, + "loss_": 1.033, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4028, + "step": 2226 + }, + { + "epoch": 0.28, + "learning_rate": 1.6900389371482286e-05, + "loss": 2.5863, + "loss_": 1.1035, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4031, + "step": 2233 + }, + { + "epoch": 0.28, + "learning_rate": 1.6879789491127837e-05, + "loss": 2.5745, + "loss_": 0.9979, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4026, + "step": 2240 + }, + { + "epoch": 0.28, + "learning_rate": 1.685913402988436e-05, + "loss": 2.5738, + "loss_": 1.056, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.403, + "step": 2247 + }, + { + "epoch": 0.28, + "learning_rate": 1.6838423154624534e-05, + "loss": 2.5971, + "loss_": 0.9538, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4021, + "step": 2254 + }, + { + "epoch": 0.28, + "learning_rate": 1.6817657032668715e-05, + "loss": 2.5999, + "loss_": 1.2746, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4024, + "step": 2261 + }, + { + "epoch": 0.28, + "learning_rate": 1.6796835831783597e-05, + "loss": 2.5917, + "loss_": 1.1284, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.4021, + "step": 2268 + }, + { + "epoch": 0.29, + "learning_rate": 1.6775959720180847e-05, + "loss": 2.5756, + "loss_": 1.0512, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.402, + "step": 2275 + }, + { + "epoch": 0.29, + "learning_rate": 1.675502886651574e-05, + "loss": 2.5869, + "loss_": 1.1705, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4019, + "step": 2282 + }, + { + "epoch": 0.29, + "learning_rate": 1.6734043439885826e-05, + "loss": 2.6105, + "loss_": 1.2021, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4019, + "step": 2289 + }, + { + "epoch": 0.29, + "learning_rate": 1.6713003609829518e-05, + "loss": 2.6133, + "loss_": 1.2789, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.402, + "step": 2296 + }, + { + "epoch": 0.29, + "learning_rate": 1.669190954632477e-05, + "loss": 2.6103, + "loss_": 1.0784, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4019, + "step": 2303 + }, + { + "epoch": 0.29, + "learning_rate": 1.667076141978765e-05, + "loss": 2.5459, + "loss_": 1.27, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4021, + "step": 2310 + }, + { + "epoch": 0.29, + "learning_rate": 1.664955940107103e-05, + "loss": 2.5936, + "loss_": 1.0663, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4026, + "step": 2317 + }, + { + "epoch": 0.29, + "learning_rate": 1.662830366146315e-05, + "loss": 2.5879, + "loss_": 0.9998, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4019, + "step": 2324 + }, + { + "epoch": 0.29, + "learning_rate": 1.6606994372686246e-05, + "loss": 2.6045, + "loss_": 1.2394, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.402, + "step": 2331 + }, + { + "epoch": 0.29, + "learning_rate": 1.6585631706895186e-05, + "loss": 2.5902, + "loss_": 1.1972, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4015, + "step": 2338 + }, + { + "epoch": 0.29, + "learning_rate": 1.6564215836676066e-05, + "loss": 2.5844, + "loss_": 0.948, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4019, + "step": 2345 + }, + { + "epoch": 0.29, + "learning_rate": 1.6542746935044793e-05, + "loss": 2.5781, + "loss_": 1.4827, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4025, + "step": 2352 + }, + { + "epoch": 0.3, + "learning_rate": 1.652122517544573e-05, + "loss": 2.5821, + "loss_": 1.0247, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4022, + "step": 2359 + }, + { + "epoch": 0.3, + "learning_rate": 1.6499650731750256e-05, + "loss": 2.6092, + "loss_": 0.8974, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4018, + "step": 2366 + }, + { + "epoch": 0.3, + "learning_rate": 1.647802377825539e-05, + "loss": 2.5766, + "loss_": 1.0648, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4016, + "step": 2373 + }, + { + "epoch": 0.3, + "learning_rate": 1.645634448968236e-05, + "loss": 2.603, + "loss_": 1.244, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4019, + "step": 2380 + }, + { + "epoch": 0.3, + "learning_rate": 1.643461304117521e-05, + "loss": 2.6323, + "loss_": 1.3655, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.4015, + "step": 2387 + }, + { + "epoch": 0.3, + "learning_rate": 1.6412829608299373e-05, + "loss": 2.6053, + "loss_": 1.3408, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4014, + "step": 2394 + }, + { + "epoch": 0.3, + "learning_rate": 1.6390994367040257e-05, + "loss": 2.6053, + "loss_": 1.3031, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4016, + "step": 2401 + }, + { + "epoch": 0.3, + "learning_rate": 1.636910749380183e-05, + "loss": 2.5956, + "loss_": 1.434, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4015, + "step": 2408 + }, + { + "epoch": 0.3, + "learning_rate": 1.634716916540517e-05, + "loss": 2.6072, + "loss_": 1.3772, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4013, + "step": 2415 + }, + { + "epoch": 0.3, + "learning_rate": 1.632517955908707e-05, + "loss": 2.6077, + "loss_": 1.2657, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4017, + "step": 2422 + }, + { + "epoch": 0.3, + "learning_rate": 1.6303138852498594e-05, + "loss": 2.5694, + "loss_": 1.3289, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4014, + "step": 2429 + }, + { + "epoch": 0.31, + "learning_rate": 1.6281047223703625e-05, + "loss": 2.5821, + "loss_": 1.1676, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4008, + "step": 2436 + }, + { + "epoch": 0.31, + "learning_rate": 1.6258904851177434e-05, + "loss": 2.5965, + "loss_": 1.2449, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4013, + "step": 2443 + }, + { + "epoch": 0.31, + "learning_rate": 1.6236711913805273e-05, + "loss": 2.6104, + "loss_": 1.1732, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4008, + "step": 2450 + }, + { + "epoch": 0.31, + "learning_rate": 1.621446859088087e-05, + "loss": 2.5975, + "loss_": 1.2338, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4008, + "step": 2457 + }, + { + "epoch": 0.31, + "learning_rate": 1.619217506210503e-05, + "loss": 2.6063, + "loss_": 1.28, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.401, + "step": 2464 + }, + { + "epoch": 0.31, + "learning_rate": 1.6169831507584152e-05, + "loss": 2.5977, + "loss_": 1.3583, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4011, + "step": 2471 + }, + { + "epoch": 0.31, + "learning_rate": 1.614743810782879e-05, + "loss": 2.6263, + "loss_": 1.4302, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.401, + "step": 2478 + }, + { + "epoch": 0.31, + "learning_rate": 1.61249950437522e-05, + "loss": 2.6303, + "loss_": 1.0776, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.401, + "step": 2485 + }, + { + "epoch": 0.31, + "learning_rate": 1.610250249666886e-05, + "loss": 2.5851, + "loss_": 1.0742, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4004, + "step": 2492 + }, + { + "epoch": 0.31, + "learning_rate": 1.6079960648293016e-05, + "loss": 2.5652, + "loss_": 1.2411, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4006, + "step": 2499 + }, + { + "epoch": 0.31, + "learning_rate": 1.605736968073721e-05, + "loss": 2.5674, + "loss_": 1.3629, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4002, + "step": 2506 + }, + { + "epoch": 0.32, + "learning_rate": 1.6034729776510817e-05, + "loss": 2.5844, + "loss_": 1.2259, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4007, + "step": 2513 + }, + { + "epoch": 0.32, + "learning_rate": 1.6012041118518558e-05, + "loss": 2.592, + "loss_": 1.3237, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4005, + "step": 2520 + }, + { + "epoch": 0.32, + "learning_rate": 1.598930389005904e-05, + "loss": 2.5949, + "loss_": 1.1398, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4003, + "step": 2527 + }, + { + "epoch": 0.32, + "learning_rate": 1.596651827482325e-05, + "loss": 2.5823, + "loss_": 0.984, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4, + "step": 2534 + }, + { + "epoch": 0.32, + "learning_rate": 1.5943684456893103e-05, + "loss": 2.5586, + "loss_": 0.8138, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.412, + "step": 2541 + }, + { + "epoch": 0.32, + "learning_rate": 1.5920802620739914e-05, + "loss": 2.6019, + "loss_": 0.9803, + "moe_loss": 0.1606, + "moe_loss_longrong": 1.4116, + "step": 2548 + }, + { + "epoch": 0.32, + "learning_rate": 1.5897872951222946e-05, + "loss": 2.5744, + "loss_": 0.8654, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3999, + "step": 2555 + }, + { + "epoch": 0.32, + "learning_rate": 1.5874895633587904e-05, + "loss": 2.5881, + "loss_": 1.1376, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4001, + "step": 2562 + }, + { + "epoch": 0.32, + "learning_rate": 1.585187085346543e-05, + "loss": 2.6219, + "loss_": 1.1824, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4003, + "step": 2569 + }, + { + "epoch": 0.32, + "learning_rate": 1.5828798796869607e-05, + "loss": 2.5878, + "loss_": 1.2474, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4005, + "step": 2576 + }, + { + "epoch": 0.32, + "learning_rate": 1.5805679650196456e-05, + "loss": 2.5889, + "loss_": 1.3011, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4002, + "step": 2583 + }, + { + "epoch": 0.32, + "learning_rate": 1.5782513600222443e-05, + "loss": 2.5666, + "loss_": 0.8722, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.4005, + "step": 2590 + }, + { + "epoch": 0.33, + "learning_rate": 1.5759300834102952e-05, + "loss": 2.562, + "loss_": 1.0941, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3999, + "step": 2597 + }, + { + "epoch": 0.33, + "learning_rate": 1.5736041539370783e-05, + "loss": 2.5698, + "loss_": 1.1632, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3999, + "step": 2604 + }, + { + "epoch": 0.33, + "learning_rate": 1.5712735903934627e-05, + "loss": 2.6022, + "loss_": 1.1992, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3999, + "step": 2611 + }, + { + "epoch": 0.33, + "learning_rate": 1.568938411607757e-05, + "loss": 2.5882, + "loss_": 1.148, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.4, + "step": 2618 + }, + { + "epoch": 0.33, + "learning_rate": 1.566598636445554e-05, + "loss": 2.5838, + "loss_": 0.9669, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3997, + "step": 2625 + }, + { + "epoch": 0.33, + "learning_rate": 1.5642542838095814e-05, + "loss": 2.5775, + "loss_": 1.1281, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3995, + "step": 2632 + }, + { + "epoch": 0.33, + "learning_rate": 1.5619053726395468e-05, + "loss": 2.5868, + "loss_": 1.0479, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3995, + "step": 2639 + }, + { + "epoch": 0.33, + "learning_rate": 1.5595519219119863e-05, + "loss": 2.6, + "loss_": 0.9972, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3993, + "step": 2646 + }, + { + "epoch": 0.33, + "learning_rate": 1.5571939506401103e-05, + "loss": 2.6007, + "loss_": 1.2232, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3993, + "step": 2653 + }, + { + "epoch": 0.33, + "learning_rate": 1.5548314778736487e-05, + "loss": 2.6087, + "loss_": 1.2657, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3996, + "step": 2660 + }, + { + "epoch": 0.33, + "learning_rate": 1.552464522698701e-05, + "loss": 2.5675, + "loss_": 1.2458, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3992, + "step": 2667 + }, + { + "epoch": 0.34, + "learning_rate": 1.550093104237577e-05, + "loss": 2.5844, + "loss_": 1.1855, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3993, + "step": 2674 + }, + { + "epoch": 0.34, + "learning_rate": 1.5477172416486464e-05, + "loss": 2.6192, + "loss_": 1.2552, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3993, + "step": 2681 + }, + { + "epoch": 0.34, + "learning_rate": 1.5453369541261814e-05, + "loss": 2.5796, + "loss_": 1.3244, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3996, + "step": 2688 + }, + { + "epoch": 0.34, + "learning_rate": 1.5429522609002034e-05, + "loss": 2.5859, + "loss_": 1.1373, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3992, + "step": 2695 + }, + { + "epoch": 0.34, + "learning_rate": 1.540563181236326e-05, + "loss": 2.5702, + "loss_": 1.3094, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.399, + "step": 2702 + }, + { + "epoch": 0.34, + "learning_rate": 1.5381697344356014e-05, + "loss": 2.5893, + "loss_": 1.2666, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3991, + "step": 2709 + }, + { + "epoch": 0.34, + "learning_rate": 1.535771939834362e-05, + "loss": 2.5504, + "loss_": 1.2537, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3992, + "step": 2716 + }, + { + "epoch": 0.34, + "learning_rate": 1.5333698168040664e-05, + "loss": 2.6094, + "loss_": 1.0996, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3992, + "step": 2723 + }, + { + "epoch": 0.34, + "learning_rate": 1.530963384751142e-05, + "loss": 2.6049, + "loss_": 1.2289, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3992, + "step": 2730 + }, + { + "epoch": 0.34, + "learning_rate": 1.5285526631168273e-05, + "loss": 2.5766, + "loss_": 1.115, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3992, + "step": 2737 + }, + { + "epoch": 0.34, + "learning_rate": 1.5261376713770176e-05, + "loss": 2.5589, + "loss_": 1.3787, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3992, + "step": 2744 + }, + { + "epoch": 0.34, + "learning_rate": 1.5237184290421035e-05, + "loss": 2.5508, + "loss_": 1.1691, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3993, + "step": 2751 + }, + { + "epoch": 0.35, + "learning_rate": 1.521294955656817e-05, + "loss": 2.567, + "loss_": 1.0558, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3989, + "step": 2758 + }, + { + "epoch": 0.35, + "learning_rate": 1.5188672708000725e-05, + "loss": 2.636, + "loss_": 1.464, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3991, + "step": 2765 + }, + { + "epoch": 0.35, + "learning_rate": 1.5164353940848068e-05, + "loss": 2.5519, + "loss_": 0.957, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3986, + "step": 2772 + }, + { + "epoch": 0.35, + "learning_rate": 1.5139993451578236e-05, + "loss": 2.6053, + "loss_": 1.2139, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3991, + "step": 2779 + }, + { + "epoch": 0.35, + "learning_rate": 1.5115591436996327e-05, + "loss": 2.5661, + "loss_": 1.424, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3987, + "step": 2786 + }, + { + "epoch": 0.35, + "learning_rate": 1.5091148094242913e-05, + "loss": 2.5659, + "loss_": 0.9377, + "moe_loss": 0.1605, + "moe_loss_longrong": 1.4097, + "step": 2793 + }, + { + "epoch": 0.35, + "learning_rate": 1.5066663620792463e-05, + "loss": 2.5646, + "loss_": 1.0845, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3987, + "step": 2800 + }, + { + "epoch": 0.35, + "learning_rate": 1.5042138214451719e-05, + "loss": 2.5793, + "loss_": 1.1437, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3982, + "step": 2807 + }, + { + "epoch": 0.35, + "learning_rate": 1.5017572073358127e-05, + "loss": 2.5658, + "loss_": 1.1455, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3984, + "step": 2814 + }, + { + "epoch": 0.35, + "learning_rate": 1.4992965395978219e-05, + "loss": 2.5799, + "loss_": 0.9263, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3986, + "step": 2821 + }, + { + "epoch": 0.35, + "learning_rate": 1.4968318381106013e-05, + "loss": 2.6166, + "loss_": 1.1558, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3986, + "step": 2828 + }, + { + "epoch": 0.36, + "learning_rate": 1.4943631227861412e-05, + "loss": 2.5847, + "loss_": 1.2277, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3985, + "step": 2835 + }, + { + "epoch": 0.36, + "learning_rate": 1.4918904135688586e-05, + "loss": 2.5822, + "loss_": 1.3077, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3986, + "step": 2842 + }, + { + "epoch": 0.36, + "learning_rate": 1.4894137304354367e-05, + "loss": 2.5709, + "loss_": 1.1239, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3986, + "step": 2849 + }, + { + "epoch": 0.36, + "learning_rate": 1.4869330933946641e-05, + "loss": 2.6017, + "loss_": 1.3636, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3984, + "step": 2856 + }, + { + "epoch": 0.36, + "learning_rate": 1.4844485224872721e-05, + "loss": 2.5933, + "loss_": 1.1977, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3983, + "step": 2863 + }, + { + "epoch": 0.36, + "learning_rate": 1.481960037785773e-05, + "loss": 2.5739, + "loss_": 1.3326, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3982, + "step": 2870 + }, + { + "epoch": 0.36, + "learning_rate": 1.4794676593942979e-05, + "loss": 2.5793, + "loss_": 1.0945, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3983, + "step": 2877 + }, + { + "epoch": 0.36, + "learning_rate": 1.476971407448435e-05, + "loss": 2.5561, + "loss_": 0.9802, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3987, + "step": 2884 + }, + { + "epoch": 0.36, + "learning_rate": 1.4744713021150665e-05, + "loss": 2.5553, + "loss_": 1.025, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3978, + "step": 2891 + }, + { + "epoch": 0.36, + "learning_rate": 1.4719673635922047e-05, + "loss": 2.5462, + "loss_": 1.1738, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3979, + "step": 2898 + }, + { + "epoch": 0.36, + "learning_rate": 1.4694596121088309e-05, + "loss": 2.58, + "loss_": 1.1755, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3978, + "step": 2905 + }, + { + "epoch": 0.37, + "learning_rate": 1.4669480679247299e-05, + "loss": 2.5715, + "loss_": 1.37, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3982, + "step": 2912 + }, + { + "epoch": 0.37, + "learning_rate": 1.4644327513303281e-05, + "loss": 2.5696, + "loss_": 1.2128, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3979, + "step": 2919 + }, + { + "epoch": 0.37, + "learning_rate": 1.4619136826465277e-05, + "loss": 2.6001, + "loss_": 1.1853, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3983, + "step": 2926 + }, + { + "epoch": 0.37, + "learning_rate": 1.4593908822245437e-05, + "loss": 2.5781, + "loss_": 1.0309, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3977, + "step": 2933 + }, + { + "epoch": 0.37, + "learning_rate": 1.4568643704457404e-05, + "loss": 2.5805, + "loss_": 1.1558, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3981, + "step": 2940 + }, + { + "epoch": 0.37, + "learning_rate": 1.454334167721464e-05, + "loss": 2.5546, + "loss_": 1.1177, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3976, + "step": 2947 + }, + { + "epoch": 0.37, + "learning_rate": 1.4518002944928807e-05, + "loss": 2.5872, + "loss_": 1.4162, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3979, + "step": 2954 + }, + { + "epoch": 0.37, + "learning_rate": 1.4492627712308094e-05, + "loss": 2.5779, + "loss_": 1.3387, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.398, + "step": 2961 + }, + { + "epoch": 0.37, + "learning_rate": 1.4467216184355577e-05, + "loss": 2.5994, + "loss_": 1.3317, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3977, + "step": 2968 + }, + { + "epoch": 0.37, + "learning_rate": 1.4441768566367554e-05, + "loss": 2.5828, + "loss_": 1.0536, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3975, + "step": 2975 + }, + { + "epoch": 0.37, + "learning_rate": 1.4416285063931887e-05, + "loss": 2.5719, + "loss_": 1.1378, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3975, + "step": 2982 + }, + { + "epoch": 0.37, + "learning_rate": 1.4390765882926348e-05, + "loss": 2.5612, + "loss_": 1.2159, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3978, + "step": 2989 + }, + { + "epoch": 0.38, + "learning_rate": 1.4365211229516951e-05, + "loss": 2.5558, + "loss_": 1.1645, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3975, + "step": 2996 + }, + { + "epoch": 0.38, + "learning_rate": 1.433962131015628e-05, + "loss": 2.5854, + "loss_": 1.1987, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3977, + "step": 3003 + }, + { + "epoch": 0.38, + "learning_rate": 1.4313996331581841e-05, + "loss": 2.5635, + "loss_": 1.3072, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3975, + "step": 3010 + }, + { + "epoch": 0.38, + "learning_rate": 1.4288336500814366e-05, + "loss": 2.5645, + "loss_": 1.0643, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3976, + "step": 3017 + }, + { + "epoch": 0.38, + "learning_rate": 1.426264202515616e-05, + "loss": 2.563, + "loss_": 1.2845, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3975, + "step": 3024 + }, + { + "epoch": 0.38, + "learning_rate": 1.4236913112189417e-05, + "loss": 2.5718, + "loss_": 1.175, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3975, + "step": 3031 + }, + { + "epoch": 0.38, + "learning_rate": 1.4211149969774544e-05, + "loss": 2.533, + "loss_": 1.1995, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3975, + "step": 3038 + }, + { + "epoch": 0.38, + "learning_rate": 1.418535280604849e-05, + "loss": 2.5548, + "loss_": 1.2251, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.397, + "step": 3045 + }, + { + "epoch": 0.38, + "learning_rate": 1.4159521829423049e-05, + "loss": 2.5767, + "loss_": 0.8661, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.397, + "step": 3052 + }, + { + "epoch": 0.38, + "learning_rate": 1.4133657248583186e-05, + "loss": 2.584, + "loss_": 0.7004, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4067, + "step": 3059 + }, + { + "epoch": 0.38, + "learning_rate": 1.410775927248536e-05, + "loss": 2.6066, + "loss_": 0.9504, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3974, + "step": 3066 + }, + { + "epoch": 0.39, + "learning_rate": 1.4081828110355806e-05, + "loss": 2.5768, + "loss_": 1.0234, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.397, + "step": 3073 + }, + { + "epoch": 0.39, + "learning_rate": 1.4055863971688886e-05, + "loss": 2.5702, + "loss_": 1.0861, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3972, + "step": 3080 + }, + { + "epoch": 0.39, + "learning_rate": 1.4029867066245363e-05, + "loss": 2.5943, + "loss_": 1.0817, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3969, + "step": 3087 + }, + { + "epoch": 0.39, + "learning_rate": 1.400383760405072e-05, + "loss": 2.5626, + "loss_": 0.9245, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.397, + "step": 3094 + }, + { + "epoch": 0.39, + "learning_rate": 1.3977775795393467e-05, + "loss": 2.5936, + "loss_": 1.3773, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3972, + "step": 3101 + }, + { + "epoch": 0.39, + "learning_rate": 1.3951681850823427e-05, + "loss": 2.5673, + "loss_": 0.8812, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3967, + "step": 3108 + }, + { + "epoch": 0.39, + "learning_rate": 1.392555598115005e-05, + "loss": 2.556, + "loss_": 1.3045, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3968, + "step": 3115 + }, + { + "epoch": 0.39, + "learning_rate": 1.3899398397440704e-05, + "loss": 2.5809, + "loss_": 1.0862, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3969, + "step": 3122 + }, + { + "epoch": 0.39, + "learning_rate": 1.3873209311018974e-05, + "loss": 2.5601, + "loss_": 0.764, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4061, + "step": 3129 + }, + { + "epoch": 0.39, + "learning_rate": 1.3846988933462944e-05, + "loss": 2.5884, + "loss_": 1.0127, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3964, + "step": 3136 + }, + { + "epoch": 0.39, + "learning_rate": 1.3820737476603506e-05, + "loss": 2.5553, + "loss_": 1.1064, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3969, + "step": 3143 + }, + { + "epoch": 0.39, + "learning_rate": 1.3794455152522619e-05, + "loss": 2.5814, + "loss_": 1.2526, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3969, + "step": 3150 + }, + { + "epoch": 0.4, + "learning_rate": 1.3768142173551638e-05, + "loss": 2.5803, + "loss_": 1.084, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3968, + "step": 3157 + }, + { + "epoch": 0.4, + "learning_rate": 1.3741798752269553e-05, + "loss": 2.5698, + "loss_": 1.1246, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3964, + "step": 3164 + }, + { + "epoch": 0.4, + "learning_rate": 1.3715425101501306e-05, + "loss": 2.5792, + "loss_": 1.421, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3965, + "step": 3171 + }, + { + "epoch": 0.4, + "learning_rate": 1.3689021434316057e-05, + "loss": 2.5823, + "loss_": 0.9307, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3965, + "step": 3178 + }, + { + "epoch": 0.4, + "learning_rate": 1.3662587964025456e-05, + "loss": 2.596, + "loss_": 1.0908, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.397, + "step": 3185 + }, + { + "epoch": 0.4, + "learning_rate": 1.363612490418194e-05, + "loss": 2.5583, + "loss_": 1.2338, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3968, + "step": 3192 + }, + { + "epoch": 0.4, + "learning_rate": 1.3609632468576997e-05, + "loss": 2.5646, + "loss_": 1.1325, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3964, + "step": 3199 + }, + { + "epoch": 0.4, + "learning_rate": 1.358311087123942e-05, + "loss": 2.557, + "loss_": 1.0712, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3961, + "step": 3206 + }, + { + "epoch": 0.4, + "learning_rate": 1.3556560326433617e-05, + "loss": 2.5436, + "loss_": 0.9923, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3967, + "step": 3213 + }, + { + "epoch": 0.4, + "learning_rate": 1.3529981048657846e-05, + "loss": 2.5845, + "loss_": 1.101, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3964, + "step": 3220 + }, + { + "epoch": 0.4, + "learning_rate": 1.35033732526425e-05, + "loss": 2.586, + "loss_": 1.153, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3963, + "step": 3227 + }, + { + "epoch": 0.41, + "learning_rate": 1.3476737153348363e-05, + "loss": 2.5813, + "loss_": 1.171, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3963, + "step": 3234 + }, + { + "epoch": 0.41, + "learning_rate": 1.3450072965964878e-05, + "loss": 2.5771, + "loss_": 1.2221, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.396, + "step": 3241 + }, + { + "epoch": 0.41, + "learning_rate": 1.342338090590841e-05, + "loss": 2.5494, + "loss_": 1.0355, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3962, + "step": 3248 + }, + { + "epoch": 0.41, + "learning_rate": 1.3396661188820505e-05, + "loss": 2.611, + "loss_": 1.2927, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3962, + "step": 3255 + }, + { + "epoch": 0.41, + "learning_rate": 1.3369914030566147e-05, + "loss": 2.5692, + "loss_": 0.5951, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3963, + "step": 3262 + }, + { + "epoch": 0.41, + "learning_rate": 1.3343139647232008e-05, + "loss": 2.566, + "loss_": 1.2681, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3958, + "step": 3269 + }, + { + "epoch": 0.41, + "learning_rate": 1.3316338255124708e-05, + "loss": 2.562, + "loss_": 1.0295, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3965, + "step": 3276 + }, + { + "epoch": 0.41, + "learning_rate": 1.3289510070769074e-05, + "loss": 2.5404, + "loss_": 1.3584, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3958, + "step": 3283 + }, + { + "epoch": 0.41, + "learning_rate": 1.3262655310906375e-05, + "loss": 2.5778, + "loss_": 1.018, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3956, + "step": 3290 + }, + { + "epoch": 0.41, + "learning_rate": 1.323577419249259e-05, + "loss": 2.5806, + "loss_": 0.9819, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3959, + "step": 3297 + }, + { + "epoch": 0.41, + "learning_rate": 1.3208866932696639e-05, + "loss": 2.5737, + "loss_": 1.1931, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3961, + "step": 3304 + }, + { + "epoch": 0.42, + "learning_rate": 1.3181933748898629e-05, + "loss": 2.5643, + "loss_": 1.1444, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3957, + "step": 3311 + }, + { + "epoch": 0.42, + "learning_rate": 1.3154974858688121e-05, + "loss": 2.5495, + "loss_": 1.2428, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3959, + "step": 3318 + }, + { + "epoch": 0.42, + "learning_rate": 1.3127990479862333e-05, + "loss": 2.5653, + "loss_": 1.3658, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3957, + "step": 3325 + }, + { + "epoch": 0.42, + "learning_rate": 1.3100980830424419e-05, + "loss": 2.5537, + "loss_": 1.2466, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3957, + "step": 3332 + }, + { + "epoch": 0.42, + "learning_rate": 1.3073946128581685e-05, + "loss": 2.5784, + "loss_": 1.1899, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3956, + "step": 3339 + }, + { + "epoch": 0.42, + "learning_rate": 1.3046886592743828e-05, + "loss": 2.5491, + "loss_": 1.1516, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.396, + "step": 3346 + }, + { + "epoch": 0.42, + "learning_rate": 1.3019802441521181e-05, + "loss": 2.5584, + "loss_": 1.3479, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3958, + "step": 3353 + }, + { + "epoch": 0.42, + "learning_rate": 1.2992693893722939e-05, + "loss": 2.5629, + "loss_": 1.226, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3956, + "step": 3360 + }, + { + "epoch": 0.42, + "learning_rate": 1.2965561168355394e-05, + "loss": 2.5635, + "loss_": 1.2831, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3956, + "step": 3367 + }, + { + "epoch": 0.42, + "learning_rate": 1.2938404484620169e-05, + "loss": 2.5392, + "loss_": 1.3104, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3956, + "step": 3374 + }, + { + "epoch": 0.42, + "learning_rate": 1.2911224061912433e-05, + "loss": 2.5353, + "loss_": 1.0487, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3953, + "step": 3381 + }, + { + "epoch": 0.42, + "learning_rate": 1.2884020119819152e-05, + "loss": 2.5758, + "loss_": 0.7415, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3954, + "step": 3388 + }, + { + "epoch": 0.43, + "learning_rate": 1.2856792878117293e-05, + "loss": 2.56, + "loss_": 1.1296, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3954, + "step": 3395 + }, + { + "epoch": 0.43, + "learning_rate": 1.2829542556772059e-05, + "loss": 2.5564, + "loss_": 0.8006, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3953, + "step": 3402 + }, + { + "epoch": 0.43, + "learning_rate": 1.2802269375935112e-05, + "loss": 2.569, + "loss_": 1.131, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3951, + "step": 3409 + }, + { + "epoch": 0.43, + "learning_rate": 1.2774973555942796e-05, + "loss": 2.5637, + "loss_": 0.9494, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3955, + "step": 3416 + }, + { + "epoch": 0.43, + "learning_rate": 1.2747655317314344e-05, + "loss": 2.5588, + "loss_": 1.3893, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3956, + "step": 3423 + }, + { + "epoch": 0.43, + "learning_rate": 1.2720314880750118e-05, + "loss": 2.562, + "loss_": 1.1676, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3955, + "step": 3430 + }, + { + "epoch": 0.43, + "learning_rate": 1.26929524671298e-05, + "loss": 2.5562, + "loss_": 0.9704, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3951, + "step": 3437 + }, + { + "epoch": 0.43, + "learning_rate": 1.266556829751064e-05, + "loss": 2.583, + "loss_": 1.1975, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3954, + "step": 3444 + }, + { + "epoch": 0.43, + "learning_rate": 1.2638162593125634e-05, + "loss": 2.6252, + "loss_": 1.1995, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3953, + "step": 3451 + }, + { + "epoch": 0.43, + "learning_rate": 1.2610735575381763e-05, + "loss": 2.5464, + "loss_": 0.9304, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4035, + "step": 3458 + }, + { + "epoch": 0.43, + "learning_rate": 1.2583287465858197e-05, + "loss": 2.5619, + "loss_": 1.1987, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3952, + "step": 3465 + }, + { + "epoch": 0.44, + "learning_rate": 1.2555818486304497e-05, + "loss": 2.5559, + "loss_": 1.2018, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.395, + "step": 3472 + }, + { + "epoch": 0.44, + "learning_rate": 1.2528328858638844e-05, + "loss": 2.5436, + "loss_": 0.6166, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3952, + "step": 3479 + }, + { + "epoch": 0.44, + "learning_rate": 1.2500818804946211e-05, + "loss": 2.5634, + "loss_": 1.1188, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3952, + "step": 3486 + }, + { + "epoch": 0.44, + "learning_rate": 1.247328854747661e-05, + "loss": 2.5476, + "loss_": 1.1271, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3948, + "step": 3493 + }, + { + "epoch": 0.44, + "learning_rate": 1.2445738308643267e-05, + "loss": 2.5728, + "loss_": 1.0833, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3952, + "step": 3500 + }, + { + "epoch": 0.44, + "learning_rate": 1.2418168311020834e-05, + "loss": 2.5511, + "loss_": 1.2348, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3951, + "step": 3507 + }, + { + "epoch": 0.44, + "learning_rate": 1.2390578777343594e-05, + "loss": 2.5674, + "loss_": 1.3258, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3952, + "step": 3514 + }, + { + "epoch": 0.44, + "learning_rate": 1.236296993050366e-05, + "loss": 2.5809, + "loss_": 1.2076, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3521 + }, + { + "epoch": 0.44, + "learning_rate": 1.2335341993549175e-05, + "loss": 2.5583, + "loss_": 0.945, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3528 + }, + { + "epoch": 0.44, + "learning_rate": 1.2307695189682502e-05, + "loss": 2.5778, + "loss_": 1.414, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3535 + }, + { + "epoch": 0.44, + "learning_rate": 1.2280029742258435e-05, + "loss": 2.5572, + "loss_": 1.2353, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.395, + "step": 3542 + }, + { + "epoch": 0.44, + "learning_rate": 1.2252345874782376e-05, + "loss": 2.5725, + "loss_": 1.304, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3949, + "step": 3549 + }, + { + "epoch": 0.45, + "learning_rate": 1.2224643810908556e-05, + "loss": 2.5498, + "loss_": 1.2018, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.395, + "step": 3556 + }, + { + "epoch": 0.45, + "learning_rate": 1.2196923774438195e-05, + "loss": 2.534, + "loss_": 1.2461, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3563 + }, + { + "epoch": 0.45, + "learning_rate": 1.2169185989317724e-05, + "loss": 2.5985, + "loss_": 1.173, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3570 + }, + { + "epoch": 0.45, + "learning_rate": 1.2141430679636959e-05, + "loss": 2.5532, + "loss_": 1.2553, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3577 + }, + { + "epoch": 0.45, + "learning_rate": 1.211365806962729e-05, + "loss": 2.5379, + "loss_": 1.1121, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3946, + "step": 3584 + }, + { + "epoch": 0.45, + "learning_rate": 1.2085868383659882e-05, + "loss": 2.5589, + "loss_": 1.1214, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3948, + "step": 3591 + }, + { + "epoch": 0.45, + "learning_rate": 1.2058061846243847e-05, + "loss": 2.5311, + "loss_": 0.9122, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3946, + "step": 3598 + }, + { + "epoch": 0.45, + "learning_rate": 1.2030238682024444e-05, + "loss": 2.5311, + "loss_": 0.7285, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4021, + "step": 3605 + }, + { + "epoch": 0.45, + "learning_rate": 1.2002399115781253e-05, + "loss": 2.5848, + "loss_": 1.1433, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3946, + "step": 3612 + }, + { + "epoch": 0.45, + "learning_rate": 1.1974543372426363e-05, + "loss": 2.5491, + "loss_": 0.8086, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3944, + "step": 3619 + }, + { + "epoch": 0.45, + "learning_rate": 1.1946671677002563e-05, + "loss": 2.5353, + "loss_": 1.1375, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3947, + "step": 3626 + }, + { + "epoch": 0.46, + "learning_rate": 1.1918784254681506e-05, + "loss": 2.5366, + "loss_": 1.0491, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3944, + "step": 3633 + }, + { + "epoch": 0.46, + "learning_rate": 1.189088133076191e-05, + "loss": 2.5363, + "loss_": 1.254, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3945, + "step": 3640 + }, + { + "epoch": 0.46, + "learning_rate": 1.1862963130667724e-05, + "loss": 2.5588, + "loss_": 1.2051, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3943, + "step": 3647 + }, + { + "epoch": 0.46, + "learning_rate": 1.1835029879946308e-05, + "loss": 2.5656, + "loss_": 0.9809, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3943, + "step": 3654 + }, + { + "epoch": 0.46, + "learning_rate": 1.1807081804266625e-05, + "loss": 2.5597, + "loss_": 0.9086, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3946, + "step": 3661 + }, + { + "epoch": 0.46, + "learning_rate": 1.1779119129417394e-05, + "loss": 2.5387, + "loss_": 0.9881, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4026, + "step": 3668 + }, + { + "epoch": 0.46, + "learning_rate": 1.175114208130528e-05, + "loss": 2.5745, + "loss_": 1.1147, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3942, + "step": 3675 + }, + { + "epoch": 0.46, + "learning_rate": 1.1723150885953081e-05, + "loss": 2.5314, + "loss_": 1.0994, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4017, + "step": 3682 + }, + { + "epoch": 0.46, + "learning_rate": 1.1695145769497871e-05, + "loss": 2.5591, + "loss_": 1.1917, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3943, + "step": 3689 + }, + { + "epoch": 0.46, + "learning_rate": 1.1667126958189203e-05, + "loss": 2.5559, + "loss_": 1.122, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3944, + "step": 3696 + }, + { + "epoch": 0.46, + "learning_rate": 1.1639094678387268e-05, + "loss": 2.5315, + "loss_": 1.3327, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3941, + "step": 3703 + }, + { + "epoch": 0.47, + "learning_rate": 1.1611049156561055e-05, + "loss": 2.5537, + "loss_": 1.1127, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.394, + "step": 3710 + }, + { + "epoch": 0.47, + "learning_rate": 1.1582990619286555e-05, + "loss": 2.5667, + "loss_": 1.1163, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3942, + "step": 3717 + }, + { + "epoch": 0.47, + "learning_rate": 1.1554919293244885e-05, + "loss": 2.5407, + "loss_": 1.3315, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3944, + "step": 3724 + }, + { + "epoch": 0.47, + "learning_rate": 1.1526835405220503e-05, + "loss": 2.5515, + "loss_": 0.9382, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3944, + "step": 3731 + }, + { + "epoch": 0.47, + "learning_rate": 1.1498739182099335e-05, + "loss": 2.5467, + "loss_": 0.94, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3941, + "step": 3738 + }, + { + "epoch": 0.47, + "learning_rate": 1.1470630850866966e-05, + "loss": 2.5174, + "loss_": 1.0814, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4014, + "step": 3745 + }, + { + "epoch": 0.47, + "learning_rate": 1.1442510638606813e-05, + "loss": 2.5437, + "loss_": 1.1433, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3939, + "step": 3752 + }, + { + "epoch": 0.47, + "learning_rate": 1.141437877249826e-05, + "loss": 2.561, + "loss_": 1.0612, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3941, + "step": 3759 + }, + { + "epoch": 0.47, + "learning_rate": 1.1386235479814856e-05, + "loss": 2.5184, + "loss_": 1.009, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.394, + "step": 3766 + }, + { + "epoch": 0.47, + "learning_rate": 1.1358080987922452e-05, + "loss": 2.5491, + "loss_": 1.0159, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.394, + "step": 3773 + }, + { + "epoch": 0.47, + "learning_rate": 1.1329915524277384e-05, + "loss": 2.5448, + "loss_": 1.2692, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3943, + "step": 3780 + }, + { + "epoch": 0.47, + "learning_rate": 1.1301739316424623e-05, + "loss": 2.5624, + "loss_": 1.1014, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3787 + }, + { + "epoch": 0.48, + "learning_rate": 1.1273552591995943e-05, + "loss": 2.5418, + "loss_": 1.3636, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3938, + "step": 3794 + }, + { + "epoch": 0.48, + "learning_rate": 1.124535557870808e-05, + "loss": 2.5431, + "loss_": 1.0197, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3938, + "step": 3801 + }, + { + "epoch": 0.48, + "learning_rate": 1.1217148504360885e-05, + "loss": 2.5773, + "loss_": 1.4614, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3941, + "step": 3808 + }, + { + "epoch": 0.48, + "learning_rate": 1.1188931596835509e-05, + "loss": 2.562, + "loss_": 0.9753, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3934, + "step": 3815 + }, + { + "epoch": 0.48, + "learning_rate": 1.1160705084092526e-05, + "loss": 2.5293, + "loss_": 1.1196, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3822 + }, + { + "epoch": 0.48, + "learning_rate": 1.1132469194170117e-05, + "loss": 2.5569, + "loss_": 1.3226, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3939, + "step": 3829 + }, + { + "epoch": 0.48, + "learning_rate": 1.1104224155182215e-05, + "loss": 2.5512, + "loss_": 1.1311, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3939, + "step": 3836 + }, + { + "epoch": 0.48, + "learning_rate": 1.1075970195316677e-05, + "loss": 2.5509, + "loss_": 1.1204, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3938, + "step": 3843 + }, + { + "epoch": 0.48, + "learning_rate": 1.104770754283342e-05, + "loss": 2.5397, + "loss_": 1.1743, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3850 + }, + { + "epoch": 0.48, + "learning_rate": 1.101943642606259e-05, + "loss": 2.5674, + "loss_": 1.3249, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3857 + }, + { + "epoch": 0.48, + "learning_rate": 1.0991157073402723e-05, + "loss": 2.5645, + "loss_": 0.9719, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3864 + }, + { + "epoch": 0.49, + "learning_rate": 1.096286971331888e-05, + "loss": 2.5283, + "loss_": 1.0872, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3938, + "step": 3871 + }, + { + "epoch": 0.49, + "learning_rate": 1.0934574574340821e-05, + "loss": 2.5598, + "loss_": 1.186, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3934, + "step": 3878 + }, + { + "epoch": 0.49, + "learning_rate": 1.0906271885061149e-05, + "loss": 2.5509, + "loss_": 1.0989, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3885 + }, + { + "epoch": 0.49, + "learning_rate": 1.0877961874133458e-05, + "loss": 2.5613, + "loss_": 1.1127, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3937, + "step": 3892 + }, + { + "epoch": 0.49, + "learning_rate": 1.0849644770270502e-05, + "loss": 2.5592, + "loss_": 1.1666, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3934, + "step": 3899 + }, + { + "epoch": 0.49, + "learning_rate": 1.0821320802242335e-05, + "loss": 2.5586, + "loss_": 1.2445, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3906 + }, + { + "epoch": 0.49, + "learning_rate": 1.0792990198874462e-05, + "loss": 2.5441, + "loss_": 0.8492, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3937, + "step": 3913 + }, + { + "epoch": 0.49, + "learning_rate": 1.0764653189046002e-05, + "loss": 2.5834, + "loss_": 1.3096, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3932, + "step": 3920 + }, + { + "epoch": 0.49, + "learning_rate": 1.073631000168782e-05, + "loss": 2.5308, + "loss_": 1.0803, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3927 + }, + { + "epoch": 0.49, + "learning_rate": 1.0707960865780697e-05, + "loss": 2.5575, + "loss_": 1.2032, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3934 + }, + { + "epoch": 0.49, + "learning_rate": 1.0679606010353467e-05, + "loss": 2.5341, + "loss_": 1.1694, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3932, + "step": 3941 + }, + { + "epoch": 0.49, + "learning_rate": 1.0651245664481176e-05, + "loss": 2.5644, + "loss_": 0.8158, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4, + "step": 3948 + }, + { + "epoch": 0.5, + "learning_rate": 1.062288005728322e-05, + "loss": 2.5545, + "loss_": 0.8523, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.3993, + "step": 3955 + }, + { + "epoch": 0.5, + "learning_rate": 1.0594509417921505e-05, + "loss": 2.5287, + "loss_": 1.0746, + "moe_loss": 0.1604, + "moe_loss_longrong": 1.4, + "step": 3962 + }, + { + "epoch": 0.5, + "learning_rate": 1.0566133975598592e-05, + "loss": 2.5596, + "loss_": 0.8816, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 3969 + }, + { + "epoch": 0.5, + "learning_rate": 1.0537753959555844e-05, + "loss": 2.5464, + "loss_": 1.315, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3936, + "step": 3976 + }, + { + "epoch": 0.5, + "learning_rate": 1.0509369599071563e-05, + "loss": 2.5213, + "loss_": 1.2024, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.4002, + "step": 3983 + }, + { + "epoch": 0.5, + "learning_rate": 1.0480981123459175e-05, + "loss": 2.5931, + "loss_": 1.2043, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3931, + "step": 3990 + }, + { + "epoch": 0.5, + "learning_rate": 1.0452588762065323e-05, + "loss": 2.5559, + "loss_": 1.1557, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3932, + "step": 3997 + }, + { + "epoch": 0.5, + "learning_rate": 1.0424192744268063e-05, + "loss": 2.5297, + "loss_": 1.2939, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3929, + "step": 4004 + }, + { + "epoch": 0.5, + "learning_rate": 1.0395793299474979e-05, + "loss": 2.5487, + "loss_": 0.9489, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 4011 + }, + { + "epoch": 0.5, + "learning_rate": 1.0367390657121346e-05, + "loss": 2.5221, + "loss_": 1.0307, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3933, + "step": 4018 + }, + { + "epoch": 0.5, + "learning_rate": 1.033898504666827e-05, + "loss": 2.5383, + "loss_": 1.0582, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 4025 + }, + { + "epoch": 0.51, + "learning_rate": 1.031057669760084e-05, + "loss": 2.5864, + "loss_": 1.1518, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3933, + "step": 4032 + }, + { + "epoch": 0.51, + "learning_rate": 1.0282165839426268e-05, + "loss": 2.5311, + "loss_": 1.1738, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3931, + "step": 4039 + }, + { + "epoch": 0.51, + "learning_rate": 1.0253752701672033e-05, + "loss": 2.5819, + "loss_": 1.1982, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 4046 + }, + { + "epoch": 0.51, + "learning_rate": 1.022533751388403e-05, + "loss": 2.5437, + "loss_": 1.288, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3929, + "step": 4053 + }, + { + "epoch": 0.51, + "learning_rate": 1.0196920505624726e-05, + "loss": 2.5305, + "loss_": 1.1663, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3928, + "step": 4060 + }, + { + "epoch": 0.51, + "learning_rate": 1.0168501906471284e-05, + "loss": 2.5629, + "loss_": 1.1848, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 4067 + }, + { + "epoch": 0.51, + "learning_rate": 1.014008194601372e-05, + "loss": 2.5355, + "loss_": 1.0939, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 4074 + }, + { + "epoch": 0.51, + "learning_rate": 1.0111660853853056e-05, + "loss": 2.5297, + "loss_": 0.9667, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.393, + "step": 4081 + }, + { + "epoch": 0.51, + "learning_rate": 1.0083238859599453e-05, + "loss": 2.5437, + "loss_": 0.7903, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.3992, + "step": 4088 + }, + { + "epoch": 0.51, + "learning_rate": 1.005481619287036e-05, + "loss": 2.5958, + "loss_": 1.1979, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3927, + "step": 4095 + }, + { + "epoch": 0.51, + "learning_rate": 1.0026393083288659e-05, + "loss": 2.5676, + "loss_": 1.1374, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3926, + "step": 4102 + }, + { + "epoch": 0.52, + "learning_rate": 9.997969760480802e-06, + "loss": 2.5415, + "loss_": 1.132, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3929, + "step": 4109 + }, + { + "epoch": 0.52, + "learning_rate": 9.969546454074977e-06, + "loss": 2.5337, + "loss_": 1.0972, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3929, + "step": 4116 + }, + { + "epoch": 0.52, + "learning_rate": 9.941123393699235e-06, + "loss": 2.5709, + "loss_": 1.0257, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3931, + "step": 4123 + }, + { + "epoch": 0.52, + "learning_rate": 9.912700808979632e-06, + "loss": 2.5593, + "loss_": 1.1574, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3928, + "step": 4130 + }, + { + "epoch": 0.52, + "learning_rate": 9.884278929538387e-06, + "loss": 2.5532, + "loss_": 1.0852, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3928, + "step": 4137 + }, + { + "epoch": 0.52, + "learning_rate": 9.855857984992026e-06, + "loss": 2.544, + "loss_": 1.2876, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3928, + "step": 4144 + }, + { + "epoch": 0.52, + "learning_rate": 9.82743820494951e-06, + "loss": 2.5251, + "loss_": 1.0657, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3925, + "step": 4151 + }, + { + "epoch": 0.52, + "learning_rate": 9.799019819010405e-06, + "loss": 2.5677, + "loss_": 1.0093, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3927, + "step": 4158 + }, + { + "epoch": 0.52, + "learning_rate": 9.770603056763009e-06, + "loss": 2.5323, + "loss_": 1.0455, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3927, + "step": 4165 + }, + { + "epoch": 0.52, + "learning_rate": 9.742188147782494e-06, + "loss": 2.5443, + "loss_": 1.2455, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3927, + "step": 4172 + }, + { + "epoch": 0.52, + "learning_rate": 9.713775321629073e-06, + "loss": 2.5462, + "loss_": 0.9975, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3928, + "step": 4179 + }, + { + "epoch": 0.52, + "learning_rate": 9.685364807846127e-06, + "loss": 2.5703, + "loss_": 1.3459, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3925, + "step": 4186 + }, + { + "epoch": 0.53, + "learning_rate": 9.656956835958356e-06, + "loss": 2.5182, + "loss_": 1.0759, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3927, + "step": 4193 + }, + { + "epoch": 0.53, + "learning_rate": 9.628551635469918e-06, + "loss": 2.566, + "loss_": 1.248, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3924, + "step": 4200 + }, + { + "epoch": 0.53, + "learning_rate": 9.600149435862593e-06, + "loss": 2.5279, + "loss_": 1.1282, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3925, + "step": 4207 + }, + { + "epoch": 0.53, + "learning_rate": 9.571750466593912e-06, + "loss": 2.5369, + "loss_": 0.8771, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3924, + "step": 4214 + }, + { + "epoch": 0.53, + "learning_rate": 9.543354957095299e-06, + "loss": 2.5527, + "loss_": 1.0412, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3926, + "step": 4221 + }, + { + "epoch": 0.53, + "learning_rate": 9.514963136770242e-06, + "loss": 2.5574, + "loss_": 0.977, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3923, + "step": 4228 + }, + { + "epoch": 0.53, + "learning_rate": 9.486575234992423e-06, + "loss": 2.5446, + "loss_": 1.2368, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3924, + "step": 4235 + }, + { + "epoch": 0.53, + "learning_rate": 9.45819148110385e-06, + "loss": 2.5138, + "loss_": 1.0194, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3925, + "step": 4242 + }, + { + "epoch": 0.53, + "learning_rate": 9.429812104413042e-06, + "loss": 2.5362, + "loss_": 0.59, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3924, + "step": 4249 + }, + { + "epoch": 0.53, + "learning_rate": 9.401437334193143e-06, + "loss": 2.5026, + "loss_": 1.1442, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3923, + "step": 4256 + }, + { + "epoch": 0.53, + "learning_rate": 9.373067399680084e-06, + "loss": 2.4977, + "loss_": 1.0847, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3925, + "step": 4263 + }, + { + "epoch": 0.54, + "learning_rate": 9.344702530070729e-06, + "loss": 2.5063, + "loss_": 1.2399, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3926, + "step": 4270 + }, + { + "epoch": 0.54, + "learning_rate": 9.316342954521028e-06, + "loss": 2.5195, + "loss_": 1.0512, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4277 + }, + { + "epoch": 0.54, + "learning_rate": 9.287988902144157e-06, + "loss": 2.5811, + "loss_": 1.2023, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3926, + "step": 4284 + }, + { + "epoch": 0.54, + "learning_rate": 9.259640602008667e-06, + "loss": 2.5491, + "loss_": 0.9779, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3922, + "step": 4291 + }, + { + "epoch": 0.54, + "learning_rate": 9.231298283136641e-06, + "loss": 2.5653, + "loss_": 1.156, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3923, + "step": 4298 + }, + { + "epoch": 0.54, + "learning_rate": 9.202962174501848e-06, + "loss": 2.5369, + "loss_": 1.1885, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3924, + "step": 4305 + }, + { + "epoch": 0.54, + "learning_rate": 9.17463250502787e-06, + "loss": 2.5003, + "loss_": 1.1876, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4312 + }, + { + "epoch": 0.54, + "learning_rate": 9.146309503586282e-06, + "loss": 2.5501, + "loss_": 1.0059, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.3976, + "step": 4319 + }, + { + "epoch": 0.54, + "learning_rate": 9.117993398994784e-06, + "loss": 2.5294, + "loss_": 1.2092, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4326 + }, + { + "epoch": 0.54, + "learning_rate": 9.089684420015346e-06, + "loss": 2.5311, + "loss_": 1.092, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3922, + "step": 4333 + }, + { + "epoch": 0.54, + "learning_rate": 9.06138279535239e-06, + "loss": 2.5313, + "loss_": 1.3094, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4340 + }, + { + "epoch": 0.54, + "learning_rate": 9.033088753650918e-06, + "loss": 2.5423, + "loss_": 0.8346, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3921, + "step": 4347 + }, + { + "epoch": 0.55, + "learning_rate": 9.004802523494655e-06, + "loss": 2.5419, + "loss_": 0.9882, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3922, + "step": 4354 + }, + { + "epoch": 0.55, + "learning_rate": 8.976524333404238e-06, + "loss": 2.5791, + "loss_": 1.1859, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3921, + "step": 4361 + }, + { + "epoch": 0.55, + "learning_rate": 8.94825441183534e-06, + "loss": 2.5575, + "loss_": 1.104, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3921, + "step": 4368 + }, + { + "epoch": 0.55, + "learning_rate": 8.919992987176836e-06, + "loss": 2.5297, + "loss_": 1.0182, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3922, + "step": 4375 + }, + { + "epoch": 0.55, + "learning_rate": 8.891740287748952e-06, + "loss": 2.5778, + "loss_": 1.0814, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3922, + "step": 4382 + }, + { + "epoch": 0.55, + "learning_rate": 8.863496541801424e-06, + "loss": 2.5495, + "loss_": 1.171, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4389 + }, + { + "epoch": 0.55, + "learning_rate": 8.835261977511666e-06, + "loss": 2.5199, + "loss_": 1.1374, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4396 + }, + { + "epoch": 0.55, + "learning_rate": 8.807036822982892e-06, + "loss": 2.5528, + "loss_": 1.2331, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3919, + "step": 4403 + }, + { + "epoch": 0.55, + "learning_rate": 8.778821306242318e-06, + "loss": 2.5504, + "loss_": 1.3552, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3921, + "step": 4410 + }, + { + "epoch": 0.55, + "learning_rate": 8.750615655239287e-06, + "loss": 2.5511, + "loss_": 1.2613, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3918, + "step": 4417 + }, + { + "epoch": 0.55, + "learning_rate": 8.722420097843437e-06, + "loss": 2.5019, + "loss_": 0.9546, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3919, + "step": 4424 + }, + { + "epoch": 0.56, + "learning_rate": 8.694234861842865e-06, + "loss": 2.5351, + "loss_": 1.1331, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3919, + "step": 4431 + }, + { + "epoch": 0.56, + "learning_rate": 8.66606017494228e-06, + "loss": 2.5412, + "loss_": 1.2775, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3921, + "step": 4438 + }, + { + "epoch": 0.56, + "learning_rate": 8.637896264761176e-06, + "loss": 2.4963, + "loss_": 0.7466, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.3971, + "step": 4445 + }, + { + "epoch": 0.56, + "learning_rate": 8.609743358831965e-06, + "loss": 2.5192, + "loss_": 1.0139, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3918, + "step": 4452 + }, + { + "epoch": 0.56, + "learning_rate": 8.58160168459817e-06, + "loss": 2.5407, + "loss_": 1.0999, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3918, + "step": 4459 + }, + { + "epoch": 0.56, + "learning_rate": 8.553471469412577e-06, + "loss": 2.5692, + "loss_": 1.1933, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3917, + "step": 4466 + }, + { + "epoch": 0.56, + "learning_rate": 8.525352940535381e-06, + "loss": 2.5063, + "loss_": 0.9695, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4473 + }, + { + "epoch": 0.56, + "learning_rate": 8.497246325132382e-06, + "loss": 2.5941, + "loss_": 1.288, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3919, + "step": 4480 + }, + { + "epoch": 0.56, + "learning_rate": 8.469151850273124e-06, + "loss": 2.5543, + "loss_": 1.1849, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.392, + "step": 4487 + }, + { + "epoch": 0.56, + "learning_rate": 8.441069742929069e-06, + "loss": 2.5168, + "loss_": 1.1032, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3917, + "step": 4494 + }, + { + "epoch": 0.56, + "learning_rate": 8.413000229971765e-06, + "loss": 2.5085, + "loss_": 1.1973, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3917, + "step": 4501 + }, + { + "epoch": 0.57, + "learning_rate": 8.384943538171017e-06, + "loss": 2.5283, + "loss_": 1.2194, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3918, + "step": 4508 + }, + { + "epoch": 0.57, + "learning_rate": 8.356899894193038e-06, + "loss": 2.5342, + "loss_": 1.176, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3917, + "step": 4515 + }, + { + "epoch": 0.57, + "learning_rate": 8.328869524598635e-06, + "loss": 2.5705, + "loss_": 1.1662, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4522 + }, + { + "epoch": 0.57, + "learning_rate": 8.300852655841378e-06, + "loss": 2.5423, + "loss_": 1.0822, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4529 + }, + { + "epoch": 0.57, + "learning_rate": 8.272849514265763e-06, + "loss": 2.5792, + "loss_": 1.1001, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4536 + }, + { + "epoch": 0.57, + "learning_rate": 8.244860326105378e-06, + "loss": 2.539, + "loss_": 1.3069, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4543 + }, + { + "epoch": 0.57, + "learning_rate": 8.216885317481091e-06, + "loss": 2.5393, + "loss_": 1.0325, + "moe_loss": 0.1603, + "moe_loss_longrong": 1.3971, + "step": 4550 + }, + { + "epoch": 0.57, + "learning_rate": 8.188924714399222e-06, + "loss": 2.5338, + "loss_": 1.0938, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3919, + "step": 4557 + }, + { + "epoch": 0.57, + "learning_rate": 8.160978742749692e-06, + "loss": 2.5578, + "loss_": 0.9998, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4564 + }, + { + "epoch": 0.57, + "learning_rate": 8.133047628304229e-06, + "loss": 2.5287, + "loss_": 1.1445, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4571 + }, + { + "epoch": 0.57, + "learning_rate": 8.105131596714538e-06, + "loss": 2.5354, + "loss_": 1.0362, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4578 + }, + { + "epoch": 0.57, + "learning_rate": 8.077230873510452e-06, + "loss": 2.548, + "loss_": 1.0803, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4585 + }, + { + "epoch": 0.58, + "learning_rate": 8.049345684098148e-06, + "loss": 2.5192, + "loss_": 1.1937, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4592 + }, + { + "epoch": 0.58, + "learning_rate": 8.021476253758303e-06, + "loss": 2.5454, + "loss_": 1.2712, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4599 + }, + { + "epoch": 0.58, + "learning_rate": 7.99362280764427e-06, + "loss": 2.5142, + "loss_": 1.3818, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3918, + "step": 4606 + }, + { + "epoch": 0.58, + "learning_rate": 7.965785570780275e-06, + "loss": 2.5291, + "loss_": 1.2159, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4613 + }, + { + "epoch": 0.58, + "learning_rate": 7.937964768059592e-06, + "loss": 2.5456, + "loss_": 1.0392, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4620 + }, + { + "epoch": 0.58, + "learning_rate": 7.91016062424273e-06, + "loss": 2.541, + "loss_": 1.2045, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4627 + }, + { + "epoch": 0.58, + "learning_rate": 7.882373363955597e-06, + "loss": 2.5365, + "loss_": 1.1843, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4634 + }, + { + "epoch": 0.58, + "learning_rate": 7.854603211687715e-06, + "loss": 2.5216, + "loss_": 1.0943, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4641 + }, + { + "epoch": 0.58, + "learning_rate": 7.826850391790393e-06, + "loss": 2.4891, + "loss_": 0.9685, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3913, + "step": 4648 + }, + { + "epoch": 0.58, + "learning_rate": 7.799115128474907e-06, + "loss": 2.5239, + "loss_": 1.093, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4655 + }, + { + "epoch": 0.58, + "learning_rate": 7.771397645810699e-06, + "loss": 2.5494, + "loss_": 1.4255, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4662 + }, + { + "epoch": 0.59, + "learning_rate": 7.743698167723568e-06, + "loss": 2.5264, + "loss_": 1.3261, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4669 + }, + { + "epoch": 0.59, + "learning_rate": 7.716016917993843e-06, + "loss": 2.5483, + "loss_": 1.047, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3912, + "step": 4676 + }, + { + "epoch": 0.59, + "learning_rate": 7.688354120254606e-06, + "loss": 2.5823, + "loss_": 1.3127, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4683 + }, + { + "epoch": 0.59, + "learning_rate": 7.660709997989855e-06, + "loss": 2.6013, + "loss_": 1.2204, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4690 + }, + { + "epoch": 0.59, + "learning_rate": 7.633084774532717e-06, + "loss": 2.5238, + "loss_": 0.9103, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3915, + "step": 4697 + }, + { + "epoch": 0.59, + "learning_rate": 7.605478673063635e-06, + "loss": 2.5269, + "loss_": 1.2055, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4704 + }, + { + "epoch": 0.59, + "learning_rate": 7.577891916608574e-06, + "loss": 2.5042, + "loss_": 1.0835, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3913, + "step": 4711 + }, + { + "epoch": 0.59, + "learning_rate": 7.5503247280372104e-06, + "loss": 2.5373, + "loss_": 0.7241, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4718 + }, + { + "epoch": 0.59, + "learning_rate": 7.522777330061126e-06, + "loss": 2.562, + "loss_": 1.3803, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3916, + "step": 4725 + }, + { + "epoch": 0.59, + "learning_rate": 7.495249945232028e-06, + "loss": 2.5934, + "loss_": 0.7858, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4732 + }, + { + "epoch": 0.59, + "learning_rate": 7.467742795939941e-06, + "loss": 2.5437, + "loss_": 1.304, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3913, + "step": 4739 + }, + { + "epoch": 0.59, + "learning_rate": 7.440256104411394e-06, + "loss": 2.5461, + "loss_": 1.188, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3913, + "step": 4746 + }, + { + "epoch": 0.6, + "learning_rate": 7.4127900927076575e-06, + "loss": 2.538, + "loss_": 1.0811, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3912, + "step": 4753 + }, + { + "epoch": 0.6, + "learning_rate": 7.385344982722928e-06, + "loss": 2.5834, + "loss_": 1.0429, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3911, + "step": 4760 + }, + { + "epoch": 0.6, + "learning_rate": 7.3579209961825346e-06, + "loss": 2.5379, + "loss_": 1.0402, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3913, + "step": 4767 + }, + { + "epoch": 0.6, + "learning_rate": 7.330518354641156e-06, + "loss": 2.5444, + "loss_": 0.9376, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3912, + "step": 4774 + }, + { + "epoch": 0.6, + "learning_rate": 7.303137279481034e-06, + "loss": 2.507, + "loss_": 1.0915, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4781 + }, + { + "epoch": 0.6, + "learning_rate": 7.275777991910164e-06, + "loss": 2.4976, + "loss_": 1.0431, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3912, + "step": 4788 + }, + { + "epoch": 0.6, + "learning_rate": 7.248440712960535e-06, + "loss": 2.5333, + "loss_": 1.1329, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3911, + "step": 4795 + }, + { + "epoch": 0.6, + "learning_rate": 7.2211256634863255e-06, + "loss": 2.5446, + "loss_": 1.0213, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3914, + "step": 4802 + }, + { + "epoch": 0.6, + "learning_rate": 7.1938330641621316e-06, + "loss": 2.5385, + "loss_": 1.0947, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4809 + }, + { + "epoch": 0.6, + "learning_rate": 7.166563135481166e-06, + "loss": 2.5381, + "loss_": 1.0597, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3911, + "step": 4816 + }, + { + "epoch": 0.6, + "learning_rate": 7.139316097753499e-06, + "loss": 2.5394, + "loss_": 1.2727, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3911, + "step": 4823 + }, + { + "epoch": 0.61, + "learning_rate": 7.112092171104268e-06, + "loss": 2.5323, + "loss_": 0.9192, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4830 + }, + { + "epoch": 0.61, + "learning_rate": 7.084891575471885e-06, + "loss": 2.5276, + "loss_": 1.0647, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3913, + "step": 4837 + }, + { + "epoch": 0.61, + "learning_rate": 7.05771453060629e-06, + "loss": 2.5388, + "loss_": 1.02, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3912, + "step": 4844 + }, + { + "epoch": 0.61, + "learning_rate": 7.030561256067159e-06, + "loss": 2.5259, + "loss_": 1.1344, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4851 + }, + { + "epoch": 0.61, + "learning_rate": 7.003431971222115e-06, + "loss": 2.5743, + "loss_": 1.1341, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3913, + "step": 4858 + }, + { + "epoch": 0.61, + "learning_rate": 6.976326895244987e-06, + "loss": 2.5281, + "loss_": 1.0979, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3909, + "step": 4865 + }, + { + "epoch": 0.61, + "learning_rate": 6.949246247114019e-06, + "loss": 2.5364, + "loss_": 1.1133, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4872 + }, + { + "epoch": 0.61, + "learning_rate": 6.922190245610106e-06, + "loss": 2.4967, + "loss_": 1.1036, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3909, + "step": 4879 + }, + { + "epoch": 0.61, + "learning_rate": 6.895159109315022e-06, + "loss": 2.525, + "loss_": 1.0468, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3951, + "step": 4886 + }, + { + "epoch": 0.61, + "learning_rate": 6.868153056609665e-06, + "loss": 2.5504, + "loss_": 1.0107, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3912, + "step": 4893 + }, + { + "epoch": 0.61, + "learning_rate": 6.841172305672289e-06, + "loss": 2.5496, + "loss_": 1.1926, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3909, + "step": 4900 + }, + { + "epoch": 0.62, + "learning_rate": 6.814217074476721e-06, + "loss": 2.5328, + "loss_": 1.2163, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3909, + "step": 4907 + }, + { + "epoch": 0.62, + "learning_rate": 6.787287580790634e-06, + "loss": 2.5072, + "loss_": 0.9685, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4914 + }, + { + "epoch": 0.62, + "learning_rate": 6.760384042173769e-06, + "loss": 2.5442, + "loss_": 0.9827, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4921 + }, + { + "epoch": 0.62, + "learning_rate": 6.733506675976171e-06, + "loss": 2.5335, + "loss_": 0.9915, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3911, + "step": 4928 + }, + { + "epoch": 0.62, + "learning_rate": 6.7066556993364525e-06, + "loss": 2.5435, + "loss_": 1.1737, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3908, + "step": 4935 + }, + { + "epoch": 0.62, + "learning_rate": 6.679831329180025e-06, + "loss": 2.5157, + "loss_": 0.8595, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4942 + }, + { + "epoch": 0.62, + "learning_rate": 6.653033782217337e-06, + "loss": 2.518, + "loss_": 1.1064, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 4949 + }, + { + "epoch": 0.62, + "learning_rate": 6.626263274942157e-06, + "loss": 2.5186, + "loss_": 1.2309, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 4956 + }, + { + "epoch": 0.62, + "learning_rate": 6.599520023629789e-06, + "loss": 2.531, + "loss_": 1.2354, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4963 + }, + { + "epoch": 0.62, + "learning_rate": 6.572804244335349e-06, + "loss": 2.5529, + "loss_": 1.056, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3909, + "step": 4970 + }, + { + "epoch": 0.62, + "learning_rate": 6.546116152891998e-06, + "loss": 2.5579, + "loss_": 1.0829, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4977 + }, + { + "epoch": 0.62, + "learning_rate": 6.519455964909223e-06, + "loss": 2.5493, + "loss_": 1.1124, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 4984 + }, + { + "epoch": 0.63, + "learning_rate": 6.492823895771077e-06, + "loss": 2.5263, + "loss_": 0.9684, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.395, + "step": 4991 + }, + { + "epoch": 0.63, + "learning_rate": 6.466220160634444e-06, + "loss": 2.5259, + "loss_": 1.2248, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 4998 + }, + { + "epoch": 0.63, + "learning_rate": 6.439644974427304e-06, + "loss": 2.5509, + "loss_": 0.9338, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 5005 + }, + { + "epoch": 0.63, + "learning_rate": 6.4130985518469965e-06, + "loss": 2.5509, + "loss_": 1.1724, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5012 + }, + { + "epoch": 0.63, + "learning_rate": 6.386581107358473e-06, + "loss": 2.5498, + "loss_": 1.2157, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.391, + "step": 5019 + }, + { + "epoch": 0.63, + "learning_rate": 6.360092855192586e-06, + "loss": 2.5417, + "loss_": 1.0377, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5026 + }, + { + "epoch": 0.63, + "learning_rate": 6.3336340093443424e-06, + "loss": 2.527, + "loss_": 1.2523, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 5033 + }, + { + "epoch": 0.63, + "learning_rate": 6.307204783571179e-06, + "loss": 2.523, + "loss_": 0.7425, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 5040 + }, + { + "epoch": 0.63, + "learning_rate": 6.280805391391238e-06, + "loss": 2.5512, + "loss_": 1.2265, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5047 + }, + { + "epoch": 0.63, + "learning_rate": 6.254436046081641e-06, + "loss": 2.5116, + "loss_": 1.2887, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 5054 + }, + { + "epoch": 0.63, + "learning_rate": 6.228096960676764e-06, + "loss": 2.5107, + "loss_": 0.8449, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5061 + }, + { + "epoch": 0.64, + "learning_rate": 6.201788347966511e-06, + "loss": 2.5181, + "loss_": 1.1947, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 5068 + }, + { + "epoch": 0.64, + "learning_rate": 6.175510420494609e-06, + "loss": 2.5637, + "loss_": 1.3614, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3909, + "step": 5075 + }, + { + "epoch": 0.64, + "learning_rate": 6.149263390556887e-06, + "loss": 2.5147, + "loss_": 1.0484, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5082 + }, + { + "epoch": 0.64, + "learning_rate": 6.123047470199539e-06, + "loss": 2.5507, + "loss_": 1.3089, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3906, + "step": 5089 + }, + { + "epoch": 0.64, + "learning_rate": 6.096862871217448e-06, + "loss": 2.5643, + "loss_": 1.0995, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5096 + }, + { + "epoch": 0.64, + "learning_rate": 6.070709805152451e-06, + "loss": 2.5202, + "loss_": 1.3114, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5103 + }, + { + "epoch": 0.64, + "learning_rate": 6.044588483291625e-06, + "loss": 2.5343, + "loss_": 1.2697, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 5110 + }, + { + "epoch": 0.64, + "learning_rate": 6.018499116665603e-06, + "loss": 2.5169, + "loss_": 1.2687, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5117 + }, + { + "epoch": 0.64, + "learning_rate": 5.9924419160468515e-06, + "loss": 2.5049, + "loss_": 0.9986, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5124 + }, + { + "epoch": 0.64, + "learning_rate": 5.966417091947965e-06, + "loss": 2.5498, + "loss_": 1.1027, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5131 + }, + { + "epoch": 0.64, + "learning_rate": 5.9404248546199795e-06, + "loss": 2.5273, + "loss_": 1.1325, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5138 + }, + { + "epoch": 0.64, + "learning_rate": 5.914465414050669e-06, + "loss": 2.5246, + "loss_": 1.1098, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3906, + "step": 5145 + }, + { + "epoch": 0.65, + "learning_rate": 5.888538979962843e-06, + "loss": 2.5145, + "loss_": 1.1524, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3905, + "step": 5152 + }, + { + "epoch": 0.65, + "learning_rate": 5.862645761812655e-06, + "loss": 2.5404, + "loss_": 1.3356, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5159 + }, + { + "epoch": 0.65, + "learning_rate": 5.836785968787915e-06, + "loss": 2.5027, + "loss_": 1.0651, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5166 + }, + { + "epoch": 0.65, + "learning_rate": 5.810959809806396e-06, + "loss": 2.5426, + "loss_": 1.0368, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5173 + }, + { + "epoch": 0.65, + "learning_rate": 5.785167493514137e-06, + "loss": 2.5547, + "loss_": 1.1137, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3908, + "step": 5180 + }, + { + "epoch": 0.65, + "learning_rate": 5.759409228283779e-06, + "loss": 2.5616, + "loss_": 1.0141, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5187 + }, + { + "epoch": 0.65, + "learning_rate": 5.733685222212868e-06, + "loss": 2.5659, + "loss_": 1.0579, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5194 + }, + { + "epoch": 0.65, + "learning_rate": 5.7079956831221616e-06, + "loss": 2.5385, + "loss_": 1.1832, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5201 + }, + { + "epoch": 0.65, + "learning_rate": 5.682340818553978e-06, + "loss": 2.5505, + "loss_": 0.9514, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5208 + }, + { + "epoch": 0.65, + "learning_rate": 5.656720835770499e-06, + "loss": 2.5296, + "loss_": 1.1111, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3907, + "step": 5215 + }, + { + "epoch": 0.65, + "learning_rate": 5.6311359417520975e-06, + "loss": 2.556, + "loss_": 1.1038, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5222 + }, + { + "epoch": 0.66, + "learning_rate": 5.605586343195676e-06, + "loss": 2.5203, + "loss_": 1.0794, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3905, + "step": 5229 + }, + { + "epoch": 0.66, + "learning_rate": 5.580072246512984e-06, + "loss": 2.531, + "loss_": 1.1714, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3905, + "step": 5236 + }, + { + "epoch": 0.66, + "learning_rate": 5.5545938578289626e-06, + "loss": 2.5175, + "loss_": 1.0077, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3904, + "step": 5243 + }, + { + "epoch": 0.66, + "learning_rate": 5.529151382980065e-06, + "loss": 2.5567, + "loss_": 0.9865, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5250 + }, + { + "epoch": 0.66, + "learning_rate": 5.503745027512608e-06, + "loss": 2.5494, + "loss_": 1.1312, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5257 + }, + { + "epoch": 0.66, + "learning_rate": 5.478374996681104e-06, + "loss": 2.511, + "loss_": 0.957, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5264 + }, + { + "epoch": 0.66, + "learning_rate": 5.453041495446596e-06, + "loss": 2.5376, + "loss_": 1.1284, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.395, + "step": 5271 + }, + { + "epoch": 0.66, + "learning_rate": 5.427744728475016e-06, + "loss": 2.519, + "loss_": 1.1458, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5278 + }, + { + "epoch": 0.66, + "learning_rate": 5.40248490013553e-06, + "loss": 2.5264, + "loss_": 1.0995, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3906, + "step": 5285 + }, + { + "epoch": 0.66, + "learning_rate": 5.3772622144988665e-06, + "loss": 2.5051, + "loss_": 1.1345, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3904, + "step": 5292 + }, + { + "epoch": 0.66, + "learning_rate": 5.352076875335697e-06, + "loss": 2.5742, + "loss_": 0.9607, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5299 + }, + { + "epoch": 0.67, + "learning_rate": 5.326929086114972e-06, + "loss": 2.5419, + "loss_": 0.9965, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5306 + }, + { + "epoch": 0.67, + "learning_rate": 5.30181905000228e-06, + "loss": 2.5681, + "loss_": 0.7612, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5313 + }, + { + "epoch": 0.67, + "learning_rate": 5.276746969858204e-06, + "loss": 2.537, + "loss_": 1.0697, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3903, + "step": 5320 + }, + { + "epoch": 0.67, + "learning_rate": 5.251713048236691e-06, + "loss": 2.5471, + "loss_": 1.0796, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5327 + }, + { + "epoch": 0.67, + "learning_rate": 5.226717487383414e-06, + "loss": 2.51, + "loss_": 1.2432, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3903, + "step": 5334 + }, + { + "epoch": 0.67, + "learning_rate": 5.20176048923412e-06, + "loss": 2.5478, + "loss_": 1.0954, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5341 + }, + { + "epoch": 0.67, + "learning_rate": 5.176842255413028e-06, + "loss": 2.5248, + "loss_": 1.106, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5348 + }, + { + "epoch": 0.67, + "learning_rate": 5.151962987231179e-06, + "loss": 2.5251, + "loss_": 1.1429, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3904, + "step": 5355 + }, + { + "epoch": 0.67, + "learning_rate": 5.127122885684815e-06, + "loss": 2.5393, + "loss_": 1.047, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5362 + }, + { + "epoch": 0.67, + "learning_rate": 5.102322151453759e-06, + "loss": 2.5347, + "loss_": 1.1776, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5369 + }, + { + "epoch": 0.67, + "learning_rate": 5.077560984899794e-06, + "loss": 2.5264, + "loss_": 1.1651, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3903, + "step": 5376 + }, + { + "epoch": 0.67, + "learning_rate": 5.052839586065027e-06, + "loss": 2.5453, + "loss_": 1.2535, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5383 + }, + { + "epoch": 0.68, + "learning_rate": 5.028158154670302e-06, + "loss": 2.5428, + "loss_": 1.1274, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3905, + "step": 5390 + }, + { + "epoch": 0.68, + "learning_rate": 5.003516890113563e-06, + "loss": 2.5141, + "loss_": 1.1251, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5397 + }, + { + "epoch": 0.68, + "learning_rate": 4.978915991468262e-06, + "loss": 2.5363, + "loss_": 1.1698, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5404 + }, + { + "epoch": 0.68, + "learning_rate": 4.954355657481722e-06, + "loss": 2.5367, + "loss_": 1.1349, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5411 + }, + { + "epoch": 0.68, + "learning_rate": 4.929836086573566e-06, + "loss": 2.5367, + "loss_": 1.2559, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5418 + }, + { + "epoch": 0.68, + "learning_rate": 4.905357476834095e-06, + "loss": 2.5303, + "loss_": 1.0117, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5425 + }, + { + "epoch": 0.68, + "learning_rate": 4.88092002602268e-06, + "loss": 2.5277, + "loss_": 1.118, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5432 + }, + { + "epoch": 0.68, + "learning_rate": 4.856523931566184e-06, + "loss": 2.5355, + "loss_": 1.2879, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3904, + "step": 5439 + }, + { + "epoch": 0.68, + "learning_rate": 4.832169390557357e-06, + "loss": 2.5615, + "loss_": 0.9215, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5446 + }, + { + "epoch": 0.68, + "learning_rate": 4.807856599753243e-06, + "loss": 2.5715, + "loss_": 1.0914, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5453 + }, + { + "epoch": 0.68, + "learning_rate": 4.783585755573589e-06, + "loss": 2.5301, + "loss_": 1.1468, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3902, + "step": 5460 + }, + { + "epoch": 0.69, + "learning_rate": 4.75935705409927e-06, + "loss": 2.5139, + "loss_": 1.2252, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3902, + "step": 5467 + }, + { + "epoch": 0.69, + "learning_rate": 4.735170691070679e-06, + "loss": 2.5219, + "loss_": 0.8784, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3902, + "step": 5474 + }, + { + "epoch": 0.69, + "learning_rate": 4.711026861886176e-06, + "loss": 2.5056, + "loss_": 1.276, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5481 + }, + { + "epoch": 0.69, + "learning_rate": 4.686925761600496e-06, + "loss": 2.5303, + "loss_": 1.0171, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5488 + }, + { + "epoch": 0.69, + "learning_rate": 4.662867584923169e-06, + "loss": 2.5533, + "loss_": 1.2451, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5495 + }, + { + "epoch": 0.69, + "learning_rate": 4.638852526216947e-06, + "loss": 2.5456, + "loss_": 1.0837, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5502 + }, + { + "epoch": 0.69, + "learning_rate": 4.614880779496244e-06, + "loss": 2.559, + "loss_": 1.108, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5509 + }, + { + "epoch": 0.69, + "learning_rate": 4.590952538425563e-06, + "loss": 2.5412, + "loss_": 1.1554, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3901, + "step": 5516 + }, + { + "epoch": 0.69, + "learning_rate": 4.567067996317922e-06, + "loss": 2.5085, + "loss_": 1.0805, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5523 + }, + { + "epoch": 0.69, + "learning_rate": 4.543227346133312e-06, + "loss": 2.5361, + "loss_": 0.7085, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5530 + }, + { + "epoch": 0.69, + "learning_rate": 4.519430780477124e-06, + "loss": 2.535, + "loss_": 1.2076, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3903, + "step": 5537 + }, + { + "epoch": 0.69, + "learning_rate": 4.495678491598587e-06, + "loss": 2.5142, + "loss_": 0.8921, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5544 + }, + { + "epoch": 0.7, + "learning_rate": 4.471970671389237e-06, + "loss": 2.4935, + "loss_": 0.8663, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3901, + "step": 5551 + }, + { + "epoch": 0.7, + "learning_rate": 4.4483075113813445e-06, + "loss": 2.5257, + "loss_": 1.1068, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5558 + }, + { + "epoch": 0.7, + "learning_rate": 4.4246892027463815e-06, + "loss": 2.5516, + "loss_": 1.3583, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3901, + "step": 5565 + }, + { + "epoch": 0.7, + "learning_rate": 4.401115936293468e-06, + "loss": 2.5143, + "loss_": 1.2772, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5572 + }, + { + "epoch": 0.7, + "learning_rate": 4.377587902467841e-06, + "loss": 2.5213, + "loss_": 1.0711, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5579 + }, + { + "epoch": 0.7, + "learning_rate": 4.354105291349301e-06, + "loss": 2.551, + "loss_": 1.0323, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.39, + "step": 5586 + }, + { + "epoch": 0.7, + "learning_rate": 4.330668292650686e-06, + "loss": 2.556, + "loss_": 1.2594, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5593 + }, + { + "epoch": 0.7, + "learning_rate": 4.3072770957163415e-06, + "loss": 2.5254, + "loss_": 1.1564, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5600 + }, + { + "epoch": 0.7, + "learning_rate": 4.283931889520587e-06, + "loss": 2.5109, + "loss_": 1.099, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5607 + }, + { + "epoch": 0.7, + "learning_rate": 4.260632862666181e-06, + "loss": 2.5028, + "loss_": 1.2475, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5614 + }, + { + "epoch": 0.7, + "learning_rate": 4.237380203382815e-06, + "loss": 2.5318, + "loss_": 1.149, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5621 + }, + { + "epoch": 0.71, + "learning_rate": 4.214174099525581e-06, + "loss": 2.5268, + "loss_": 1.2824, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3901, + "step": 5628 + }, + { + "epoch": 0.71, + "learning_rate": 4.191014738573448e-06, + "loss": 2.5064, + "loss_": 1.0164, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3901, + "step": 5635 + }, + { + "epoch": 0.71, + "learning_rate": 4.1679023076277644e-06, + "loss": 2.5413, + "loss_": 1.1083, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3902, + "step": 5642 + }, + { + "epoch": 0.71, + "learning_rate": 4.144836993410739e-06, + "loss": 2.5067, + "loss_": 1.174, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.39, + "step": 5649 + }, + { + "epoch": 0.71, + "learning_rate": 4.12181898226392e-06, + "loss": 2.5303, + "loss_": 1.3489, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.39, + "step": 5656 + }, + { + "epoch": 0.71, + "learning_rate": 4.098848460146709e-06, + "loss": 2.5134, + "loss_": 1.2256, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5663 + }, + { + "epoch": 0.71, + "learning_rate": 4.07592561263485e-06, + "loss": 2.5352, + "loss_": 1.0677, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 5670 + }, + { + "epoch": 0.71, + "learning_rate": 4.053050624918927e-06, + "loss": 2.5389, + "loss_": 1.3495, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3902, + "step": 5677 + }, + { + "epoch": 0.71, + "learning_rate": 4.030223681802873e-06, + "loss": 2.5214, + "loss_": 0.9889, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5684 + }, + { + "epoch": 0.71, + "learning_rate": 4.007444967702475e-06, + "loss": 2.5118, + "loss_": 0.9998, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3901, + "step": 5691 + }, + { + "epoch": 0.71, + "learning_rate": 3.984714666643887e-06, + "loss": 2.5307, + "loss_": 1.176, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5698 + }, + { + "epoch": 0.72, + "learning_rate": 3.962032962262132e-06, + "loss": 2.5366, + "loss_": 1.0218, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5705 + }, + { + "epoch": 0.72, + "learning_rate": 3.9394000377996355e-06, + "loss": 2.5117, + "loss_": 0.8796, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3942, + "step": 5712 + }, + { + "epoch": 0.72, + "learning_rate": 3.916816076104737e-06, + "loss": 2.5142, + "loss_": 0.9879, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3941, + "step": 5719 + }, + { + "epoch": 0.72, + "learning_rate": 3.894281259630203e-06, + "loss": 2.505, + "loss_": 1.0209, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.39, + "step": 5726 + }, + { + "epoch": 0.72, + "learning_rate": 3.871795770431772e-06, + "loss": 2.547, + "loss_": 1.0637, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.39, + "step": 5733 + }, + { + "epoch": 0.72, + "learning_rate": 3.84935979016667e-06, + "loss": 2.5078, + "loss_": 1.1125, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5740 + }, + { + "epoch": 0.72, + "learning_rate": 3.826973500092153e-06, + "loss": 2.5075, + "loss_": 1.0352, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5747 + }, + { + "epoch": 0.72, + "learning_rate": 3.8046370810640223e-06, + "loss": 2.5161, + "loss_": 1.167, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3899, + "step": 5754 + }, + { + "epoch": 0.72, + "learning_rate": 3.782350713535192e-06, + "loss": 2.5364, + "loss_": 1.0836, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5761 + }, + { + "epoch": 0.72, + "learning_rate": 3.760114577554216e-06, + "loss": 2.5025, + "loss_": 1.31, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5768 + }, + { + "epoch": 0.72, + "learning_rate": 3.7379288527638203e-06, + "loss": 2.4932, + "loss_": 1.139, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5775 + }, + { + "epoch": 0.72, + "learning_rate": 3.715793718399482e-06, + "loss": 2.5266, + "loss_": 1.0568, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5782 + }, + { + "epoch": 0.73, + "learning_rate": 3.6937093532879576e-06, + "loss": 2.4954, + "loss_": 1.087, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5789 + }, + { + "epoch": 0.73, + "learning_rate": 3.6716759358458467e-06, + "loss": 2.5337, + "loss_": 1.1604, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5796 + }, + { + "epoch": 0.73, + "learning_rate": 3.6496936440781496e-06, + "loss": 2.5421, + "loss_": 1.2507, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.39, + "step": 5803 + }, + { + "epoch": 0.73, + "learning_rate": 3.6277626555768307e-06, + "loss": 2.5217, + "loss_": 1.1458, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5810 + }, + { + "epoch": 0.73, + "learning_rate": 3.605883147519377e-06, + "loss": 2.5247, + "loss_": 1.0988, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5817 + }, + { + "epoch": 0.73, + "learning_rate": 3.584055296667377e-06, + "loss": 2.5367, + "loss_": 1.2706, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5824 + }, + { + "epoch": 0.73, + "learning_rate": 3.562279279365086e-06, + "loss": 2.4891, + "loss_": 0.9378, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3898, + "step": 5831 + }, + { + "epoch": 0.73, + "learning_rate": 3.5405552715380075e-06, + "loss": 2.5432, + "loss_": 1.1877, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5838 + }, + { + "epoch": 0.73, + "learning_rate": 3.518883448691457e-06, + "loss": 2.5668, + "loss_": 1.1926, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5845 + }, + { + "epoch": 0.73, + "learning_rate": 3.497263985909163e-06, + "loss": 2.5219, + "loss_": 1.0161, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5852 + }, + { + "epoch": 0.73, + "learning_rate": 3.4756970578518456e-06, + "loss": 2.4932, + "loss_": 0.8275, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3899, + "step": 5859 + }, + { + "epoch": 0.74, + "learning_rate": 3.4541828387557953e-06, + "loss": 2.5043, + "loss_": 1.0652, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 5866 + }, + { + "epoch": 0.74, + "learning_rate": 3.43272150243148e-06, + "loss": 2.5058, + "loss_": 1.0976, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5873 + }, + { + "epoch": 0.74, + "learning_rate": 3.4113132222621382e-06, + "loss": 2.531, + "loss_": 1.2403, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5880 + }, + { + "epoch": 0.74, + "learning_rate": 3.3899581712023644e-06, + "loss": 2.5157, + "loss_": 1.1653, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3901, + "step": 5887 + }, + { + "epoch": 0.74, + "learning_rate": 3.3686565217767307e-06, + "loss": 2.5229, + "loss_": 1.175, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5894 + }, + { + "epoch": 0.74, + "learning_rate": 3.347408446078384e-06, + "loss": 2.5029, + "loss_": 0.9687, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5901 + }, + { + "epoch": 0.74, + "learning_rate": 3.326214115767654e-06, + "loss": 2.5651, + "loss_": 1.1922, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5908 + }, + { + "epoch": 0.74, + "learning_rate": 3.3050737020706693e-06, + "loss": 2.5259, + "loss_": 1.145, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3898, + "step": 5915 + }, + { + "epoch": 0.74, + "learning_rate": 3.283987375777974e-06, + "loss": 2.5289, + "loss_": 1.2907, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5922 + }, + { + "epoch": 0.74, + "learning_rate": 3.26295530724315e-06, + "loss": 2.5091, + "loss_": 0.8317, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5929 + }, + { + "epoch": 0.74, + "learning_rate": 3.2419776663814284e-06, + "loss": 2.5086, + "loss_": 1.212, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 5936 + }, + { + "epoch": 0.74, + "learning_rate": 3.221054622668337e-06, + "loss": 2.5146, + "loss_": 0.9387, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5943 + }, + { + "epoch": 0.75, + "learning_rate": 3.2001863451383186e-06, + "loss": 2.5292, + "loss_": 1.1503, + "moe_loss": 0.16, + "moe_loss_longrong": 1.39, + "step": 5950 + }, + { + "epoch": 0.75, + "learning_rate": 3.1793730023833613e-06, + "loss": 2.5435, + "loss_": 1.1373, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 5957 + }, + { + "epoch": 0.75, + "learning_rate": 3.1586147625516485e-06, + "loss": 2.5105, + "loss_": 0.9076, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 5964 + }, + { + "epoch": 0.75, + "learning_rate": 3.1379117933461967e-06, + "loss": 2.5015, + "loss_": 1.1139, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 5971 + }, + { + "epoch": 0.75, + "learning_rate": 3.117264262023488e-06, + "loss": 2.5259, + "loss_": 1.0666, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5978 + }, + { + "epoch": 0.75, + "learning_rate": 3.096672335392139e-06, + "loss": 2.511, + "loss_": 0.959, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 5985 + }, + { + "epoch": 0.75, + "learning_rate": 3.0761361798115454e-06, + "loss": 2.5324, + "loss_": 1.265, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 5992 + }, + { + "epoch": 0.75, + "learning_rate": 3.0556559611905236e-06, + "loss": 2.5163, + "loss_": 1.1558, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3899, + "step": 5999 + }, + { + "epoch": 0.75, + "learning_rate": 3.035231844985993e-06, + "loss": 2.5014, + "loss_": 1.2063, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6006 + }, + { + "epoch": 0.75, + "learning_rate": 3.014863996201628e-06, + "loss": 2.4744, + "loss_": 0.9746, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3936, + "step": 6013 + }, + { + "epoch": 0.75, + "learning_rate": 2.9945525793865237e-06, + "loss": 2.5197, + "loss_": 0.9524, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 6020 + }, + { + "epoch": 0.76, + "learning_rate": 2.9742977586338718e-06, + "loss": 2.5209, + "loss_": 1.0009, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6027 + }, + { + "epoch": 0.76, + "learning_rate": 2.9540996975796288e-06, + "loss": 2.4865, + "loss_": 0.8742, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 6034 + }, + { + "epoch": 0.76, + "learning_rate": 2.9339585594012034e-06, + "loss": 2.5309, + "loss_": 0.9582, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6041 + }, + { + "epoch": 0.76, + "learning_rate": 2.913874506816119e-06, + "loss": 2.527, + "loss_": 1.2344, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6048 + }, + { + "epoch": 0.76, + "learning_rate": 2.8938477020807267e-06, + "loss": 2.524, + "loss_": 1.0626, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6055 + }, + { + "epoch": 0.76, + "learning_rate": 2.873878306988874e-06, + "loss": 2.5321, + "loss_": 0.7181, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3939, + "step": 6062 + }, + { + "epoch": 0.76, + "learning_rate": 2.8539664828706002e-06, + "loss": 2.4993, + "loss_": 1.0792, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 6069 + }, + { + "epoch": 0.76, + "learning_rate": 2.8341123905908406e-06, + "loss": 2.5386, + "loss_": 1.1162, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6076 + }, + { + "epoch": 0.76, + "learning_rate": 2.8143161905481277e-06, + "loss": 2.5657, + "loss_": 1.2773, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6083 + }, + { + "epoch": 0.76, + "learning_rate": 2.7945780426732773e-06, + "loss": 2.5597, + "loss_": 1.1365, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6090 + }, + { + "epoch": 0.76, + "learning_rate": 2.77489810642812e-06, + "loss": 2.5335, + "loss_": 1.1916, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6097 + }, + { + "epoch": 0.77, + "learning_rate": 2.7552765408042003e-06, + "loss": 2.5169, + "loss_": 1.1662, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6104 + }, + { + "epoch": 0.77, + "learning_rate": 2.7357135043214954e-06, + "loss": 2.5135, + "loss_": 1.0476, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 6111 + }, + { + "epoch": 0.77, + "learning_rate": 2.7162091550271273e-06, + "loss": 2.4995, + "loss_": 1.1279, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6118 + }, + { + "epoch": 0.77, + "learning_rate": 2.6967636504940995e-06, + "loss": 2.54, + "loss_": 1.1283, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6125 + }, + { + "epoch": 0.77, + "learning_rate": 2.677377147820013e-06, + "loss": 2.5405, + "loss_": 0.896, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6132 + }, + { + "epoch": 0.77, + "learning_rate": 2.6580498036258016e-06, + "loss": 2.5475, + "loss_": 1.0748, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6139 + }, + { + "epoch": 0.77, + "learning_rate": 2.6387817740544665e-06, + "loss": 2.5046, + "loss_": 1.0242, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6146 + }, + { + "epoch": 0.77, + "learning_rate": 2.6195732147698148e-06, + "loss": 2.5553, + "loss_": 0.9529, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3936, + "step": 6153 + }, + { + "epoch": 0.77, + "learning_rate": 2.600424280955196e-06, + "loss": 2.5311, + "loss_": 1.1188, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6160 + }, + { + "epoch": 0.77, + "learning_rate": 2.581335127312257e-06, + "loss": 2.4974, + "loss_": 0.9955, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6167 + }, + { + "epoch": 0.77, + "learning_rate": 2.562305908059691e-06, + "loss": 2.5107, + "loss_": 1.0771, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6174 + }, + { + "epoch": 0.77, + "learning_rate": 2.5433367769319894e-06, + "loss": 2.5161, + "loss_": 1.1527, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6181 + }, + { + "epoch": 0.78, + "learning_rate": 2.5244278871781924e-06, + "loss": 2.5067, + "loss_": 0.7494, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6188 + }, + { + "epoch": 0.78, + "learning_rate": 2.505579391560665e-06, + "loss": 2.5101, + "loss_": 1.05, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6195 + }, + { + "epoch": 0.78, + "learning_rate": 2.4867914423538596e-06, + "loss": 2.505, + "loss_": 0.8529, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3931, + "step": 6202 + }, + { + "epoch": 0.78, + "learning_rate": 2.4680641913430703e-06, + "loss": 2.5413, + "loss_": 1.0996, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6209 + }, + { + "epoch": 0.78, + "learning_rate": 2.449397789823229e-06, + "loss": 2.5299, + "loss_": 1.198, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6216 + }, + { + "epoch": 0.78, + "learning_rate": 2.4307923885976724e-06, + "loss": 2.5472, + "loss_": 1.1477, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6223 + }, + { + "epoch": 0.78, + "learning_rate": 2.4122481379769157e-06, + "loss": 2.5024, + "loss_": 1.329, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6230 + }, + { + "epoch": 0.78, + "learning_rate": 2.3937651877774537e-06, + "loss": 2.5363, + "loss_": 1.3376, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6237 + }, + { + "epoch": 0.78, + "learning_rate": 2.3753436873205437e-06, + "loss": 2.5159, + "loss_": 0.9258, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6244 + }, + { + "epoch": 0.78, + "learning_rate": 2.356983785430996e-06, + "loss": 2.5133, + "loss_": 1.1375, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6251 + }, + { + "epoch": 0.78, + "learning_rate": 2.338685630435975e-06, + "loss": 2.5141, + "loss_": 1.0395, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6258 + }, + { + "epoch": 0.79, + "learning_rate": 2.320449370163802e-06, + "loss": 2.5141, + "loss_": 1.2221, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6265 + }, + { + "epoch": 0.79, + "learning_rate": 2.30227515194276e-06, + "loss": 2.5207, + "loss_": 1.1959, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6272 + }, + { + "epoch": 0.79, + "learning_rate": 2.284163122599895e-06, + "loss": 2.5455, + "loss_": 1.0789, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6279 + }, + { + "epoch": 0.79, + "learning_rate": 2.2661134284598442e-06, + "loss": 2.507, + "loss_": 1.1284, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3898, + "step": 6286 + }, + { + "epoch": 0.79, + "learning_rate": 2.248126215343651e-06, + "loss": 2.5232, + "loss_": 1.0936, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6293 + }, + { + "epoch": 0.79, + "learning_rate": 2.230201628567572e-06, + "loss": 2.5369, + "loss_": 1.2088, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6300 + }, + { + "epoch": 0.79, + "learning_rate": 2.2123398129419214e-06, + "loss": 2.5085, + "loss_": 1.3623, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6307 + }, + { + "epoch": 0.79, + "learning_rate": 2.1945409127698967e-06, + "loss": 2.5114, + "loss_": 1.109, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6314 + }, + { + "epoch": 0.79, + "learning_rate": 2.1768050718464006e-06, + "loss": 2.5095, + "loss_": 1.1855, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6321 + }, + { + "epoch": 0.79, + "learning_rate": 2.1591324334568943e-06, + "loss": 2.5014, + "loss_": 1.1081, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6328 + }, + { + "epoch": 0.79, + "learning_rate": 2.1415231403762383e-06, + "loss": 2.4978, + "loss_": 0.9622, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6335 + }, + { + "epoch": 0.79, + "learning_rate": 2.123977334867523e-06, + "loss": 2.5578, + "loss_": 1.4506, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6342 + }, + { + "epoch": 0.8, + "learning_rate": 2.1064951586809434e-06, + "loss": 2.5026, + "loss_": 1.1986, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6349 + }, + { + "epoch": 0.8, + "learning_rate": 2.0890767530526358e-06, + "loss": 2.5363, + "loss_": 1.1286, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3897, + "step": 6356 + }, + { + "epoch": 0.8, + "learning_rate": 2.0717222587035435e-06, + "loss": 2.5241, + "loss_": 0.963, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6363 + }, + { + "epoch": 0.8, + "learning_rate": 2.0544318158382815e-06, + "loss": 2.5148, + "loss_": 0.9621, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6370 + }, + { + "epoch": 0.8, + "learning_rate": 2.037205564143999e-06, + "loss": 2.5373, + "loss_": 1.147, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6377 + }, + { + "epoch": 0.8, + "learning_rate": 2.0200436427892554e-06, + "loss": 2.5173, + "loss_": 1.198, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6384 + }, + { + "epoch": 0.8, + "learning_rate": 2.0029461904228896e-06, + "loss": 2.5232, + "loss_": 1.058, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6391 + }, + { + "epoch": 0.8, + "learning_rate": 1.9859133451729094e-06, + "loss": 2.5238, + "loss_": 1.14, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6398 + }, + { + "epoch": 0.8, + "learning_rate": 1.9689452446453693e-06, + "loss": 2.5138, + "loss_": 1.137, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6405 + }, + { + "epoch": 0.8, + "learning_rate": 1.9520420259232566e-06, + "loss": 2.5304, + "loss_": 1.024, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6412 + }, + { + "epoch": 0.8, + "learning_rate": 1.9352038255653893e-06, + "loss": 2.514, + "loss_": 0.6869, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6419 + }, + { + "epoch": 0.81, + "learning_rate": 1.918430779605317e-06, + "loss": 2.5432, + "loss_": 1.1982, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6426 + }, + { + "epoch": 0.81, + "learning_rate": 1.9017230235502027e-06, + "loss": 2.5134, + "loss_": 0.7847, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6433 + }, + { + "epoch": 0.81, + "learning_rate": 1.8850806923797516e-06, + "loss": 2.5159, + "loss_": 1.457, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6440 + }, + { + "epoch": 0.81, + "learning_rate": 1.8685039205451072e-06, + "loss": 2.5284, + "loss_": 0.9184, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6447 + }, + { + "epoch": 0.81, + "learning_rate": 1.8519928419677703e-06, + "loss": 2.5196, + "loss_": 1.1945, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6454 + }, + { + "epoch": 0.81, + "learning_rate": 1.8355475900385056e-06, + "loss": 2.5399, + "loss_": 1.0441, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6461 + }, + { + "epoch": 0.81, + "learning_rate": 1.819168297616284e-06, + "loss": 2.4934, + "loss_": 0.9845, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3931, + "step": 6468 + }, + { + "epoch": 0.81, + "learning_rate": 1.802855097027194e-06, + "loss": 2.4904, + "loss_": 1.1509, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6475 + }, + { + "epoch": 0.81, + "learning_rate": 1.7866081200633756e-06, + "loss": 2.4643, + "loss_": 1.0501, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6482 + }, + { + "epoch": 0.81, + "learning_rate": 1.7704274979819614e-06, + "loss": 2.5138, + "loss_": 1.171, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6489 + }, + { + "epoch": 0.81, + "learning_rate": 1.7543133615040098e-06, + "loss": 2.5229, + "loss_": 1.4327, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6496 + }, + { + "epoch": 0.82, + "learning_rate": 1.7382658408134467e-06, + "loss": 2.5343, + "loss_": 1.0981, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6503 + }, + { + "epoch": 0.82, + "learning_rate": 1.7222850655560241e-06, + "loss": 2.5169, + "loss_": 1.0696, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6510 + }, + { + "epoch": 0.82, + "learning_rate": 1.7063711648382665e-06, + "loss": 2.5251, + "loss_": 1.1954, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6517 + }, + { + "epoch": 0.82, + "learning_rate": 1.690524267226421e-06, + "loss": 2.5307, + "loss_": 1.1597, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6524 + }, + { + "epoch": 0.82, + "learning_rate": 1.6747445007454333e-06, + "loss": 2.5079, + "loss_": 1.1213, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6531 + }, + { + "epoch": 0.82, + "learning_rate": 1.659031992877903e-06, + "loss": 2.5285, + "loss_": 1.0738, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6538 + }, + { + "epoch": 0.82, + "learning_rate": 1.6433868705630584e-06, + "loss": 2.517, + "loss_": 0.7318, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6545 + }, + { + "epoch": 0.82, + "learning_rate": 1.6278092601957241e-06, + "loss": 2.5266, + "loss_": 1.0867, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3896, + "step": 6552 + }, + { + "epoch": 0.82, + "learning_rate": 1.6122992876253086e-06, + "loss": 2.5332, + "loss_": 1.0809, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6559 + }, + { + "epoch": 0.82, + "learning_rate": 1.5968570781547864e-06, + "loss": 2.5117, + "loss_": 1.2174, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6566 + }, + { + "epoch": 0.82, + "learning_rate": 1.581482756539674e-06, + "loss": 2.517, + "loss_": 1.3342, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6573 + }, + { + "epoch": 0.82, + "learning_rate": 1.5661764469870412e-06, + "loss": 2.5286, + "loss_": 0.9451, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6580 + }, + { + "epoch": 0.83, + "learning_rate": 1.5509382731544908e-06, + "loss": 2.5163, + "loss_": 1.1528, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6587 + }, + { + "epoch": 0.83, + "learning_rate": 1.53576835814917e-06, + "loss": 2.5028, + "loss_": 0.835, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3933, + "step": 6594 + }, + { + "epoch": 0.83, + "learning_rate": 1.5206668245267709e-06, + "loss": 2.5126, + "loss_": 0.8754, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6601 + }, + { + "epoch": 0.83, + "learning_rate": 1.5056337942905408e-06, + "loss": 2.5059, + "loss_": 0.9523, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6608 + }, + { + "epoch": 0.83, + "learning_rate": 1.4906693888903022e-06, + "loss": 2.5235, + "loss_": 1.1009, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6615 + }, + { + "epoch": 0.83, + "learning_rate": 1.475773729221457e-06, + "loss": 2.5315, + "loss_": 1.1289, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6622 + }, + { + "epoch": 0.83, + "learning_rate": 1.460946935624027e-06, + "loss": 2.5179, + "loss_": 0.9098, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6629 + }, + { + "epoch": 0.83, + "learning_rate": 1.4461891278816775e-06, + "loss": 2.5291, + "loss_": 1.1364, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6636 + }, + { + "epoch": 0.83, + "learning_rate": 1.4315004252207354e-06, + "loss": 2.5287, + "loss_": 1.0815, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6643 + }, + { + "epoch": 0.83, + "learning_rate": 1.4168809463092459e-06, + "loss": 2.5112, + "loss_": 0.9575, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6650 + }, + { + "epoch": 0.83, + "learning_rate": 1.402330809256005e-06, + "loss": 2.5271, + "loss_": 1.1014, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6657 + }, + { + "epoch": 0.84, + "learning_rate": 1.387850131609597e-06, + "loss": 2.4711, + "loss_": 1.0401, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6664 + }, + { + "epoch": 0.84, + "learning_rate": 1.3734390303574619e-06, + "loss": 2.5261, + "loss_": 1.2037, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6671 + }, + { + "epoch": 0.84, + "learning_rate": 1.3590976219249386e-06, + "loss": 2.5267, + "loss_": 1.1024, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6678 + }, + { + "epoch": 0.84, + "learning_rate": 1.3448260221743249e-06, + "loss": 2.5327, + "loss_": 1.1507, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6685 + }, + { + "epoch": 0.84, + "learning_rate": 1.3306243464039458e-06, + "loss": 2.5205, + "loss_": 1.2929, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6692 + }, + { + "epoch": 0.84, + "learning_rate": 1.3164927093472235e-06, + "loss": 2.5205, + "loss_": 1.1681, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6699 + }, + { + "epoch": 0.84, + "learning_rate": 1.3024312251717365e-06, + "loss": 2.5222, + "loss_": 1.2896, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6706 + }, + { + "epoch": 0.84, + "learning_rate": 1.2884400074783176e-06, + "loss": 2.482, + "loss_": 1.4715, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6713 + }, + { + "epoch": 0.84, + "learning_rate": 1.2745191693001214e-06, + "loss": 2.5152, + "loss_": 1.1018, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6720 + }, + { + "epoch": 0.84, + "learning_rate": 1.2606688231017205e-06, + "loss": 2.4911, + "loss_": 0.8081, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6727 + }, + { + "epoch": 0.84, + "learning_rate": 1.246889080778184e-06, + "loss": 2.5173, + "loss_": 1.234, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6734 + }, + { + "epoch": 0.84, + "learning_rate": 1.2331800536541894e-06, + "loss": 2.5114, + "loss_": 1.3323, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6741 + }, + { + "epoch": 0.85, + "learning_rate": 1.219541852483115e-06, + "loss": 2.5135, + "loss_": 1.0719, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6748 + }, + { + "epoch": 0.85, + "learning_rate": 1.2059745874461403e-06, + "loss": 2.5229, + "loss_": 1.1145, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6755 + }, + { + "epoch": 0.85, + "learning_rate": 1.1924783681513664e-06, + "loss": 2.5145, + "loss_": 1.0924, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6762 + }, + { + "epoch": 0.85, + "learning_rate": 1.1790533036329265e-06, + "loss": 2.5242, + "loss_": 1.1827, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6769 + }, + { + "epoch": 0.85, + "learning_rate": 1.1656995023500971e-06, + "loss": 2.51, + "loss_": 0.9651, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6776 + }, + { + "epoch": 0.85, + "learning_rate": 1.1524170721864358e-06, + "loss": 2.5144, + "loss_": 1.4801, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6783 + }, + { + "epoch": 0.85, + "learning_rate": 1.139206120448899e-06, + "loss": 2.4961, + "loss_": 0.9054, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6790 + }, + { + "epoch": 0.85, + "learning_rate": 1.12606675386698e-06, + "loss": 2.5091, + "loss_": 1.19, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6797 + }, + { + "epoch": 0.85, + "learning_rate": 1.1129990785918444e-06, + "loss": 2.5346, + "loss_": 0.9222, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6804 + }, + { + "epoch": 0.85, + "learning_rate": 1.100003200195474e-06, + "loss": 2.5121, + "loss_": 0.9459, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6811 + }, + { + "epoch": 0.85, + "learning_rate": 1.0870792236698157e-06, + "loss": 2.5331, + "loss_": 1.1242, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6818 + }, + { + "epoch": 0.86, + "learning_rate": 1.0742272534259234e-06, + "loss": 2.5094, + "loss_": 1.0776, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6825 + }, + { + "epoch": 0.86, + "learning_rate": 1.061447393293129e-06, + "loss": 2.5038, + "loss_": 0.8106, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6832 + }, + { + "epoch": 0.86, + "learning_rate": 1.048739746518197e-06, + "loss": 2.5264, + "loss_": 1.0267, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6839 + }, + { + "epoch": 0.86, + "learning_rate": 1.0361044157644828e-06, + "loss": 2.4963, + "loss_": 0.8518, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6846 + }, + { + "epoch": 0.86, + "learning_rate": 1.0235415031111173e-06, + "loss": 2.5199, + "loss_": 0.9971, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6853 + }, + { + "epoch": 0.86, + "learning_rate": 1.0110511100521747e-06, + "loss": 2.5356, + "loss_": 0.9752, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6860 + }, + { + "epoch": 0.86, + "learning_rate": 9.98633337495848e-07, + "loss": 2.5046, + "loss_": 1.149, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6867 + }, + { + "epoch": 0.86, + "learning_rate": 9.862882857636446e-07, + "loss": 2.5399, + "loss_": 1.2307, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6874 + }, + { + "epoch": 0.86, + "learning_rate": 9.740160545895683e-07, + "loss": 2.506, + "loss_": 1.0997, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 6881 + }, + { + "epoch": 0.86, + "learning_rate": 9.61816743119317e-07, + "loss": 2.4961, + "loss_": 0.9872, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3931, + "step": 6888 + }, + { + "epoch": 0.86, + "learning_rate": 9.49690449909475e-07, + "loss": 2.5033, + "loss_": 0.8414, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3932, + "step": 6895 + }, + { + "epoch": 0.87, + "learning_rate": 9.376372729267269e-07, + "loss": 2.5298, + "loss_": 1.2211, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6902 + }, + { + "epoch": 0.87, + "learning_rate": 9.256573095470601e-07, + "loss": 2.5483, + "loss_": 0.8932, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6909 + }, + { + "epoch": 0.87, + "learning_rate": 9.137506565549791e-07, + "loss": 2.493, + "loss_": 1.1945, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6916 + }, + { + "epoch": 0.87, + "learning_rate": 9.019174101427219e-07, + "loss": 2.5231, + "loss_": 1.1885, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6923 + }, + { + "epoch": 0.87, + "learning_rate": 8.901576659094901e-07, + "loss": 2.5306, + "loss_": 1.0664, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6930 + }, + { + "epoch": 0.87, + "learning_rate": 8.784715188606629e-07, + "loss": 2.5236, + "loss_": 1.3339, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6937 + }, + { + "epoch": 0.87, + "learning_rate": 8.668590634070428e-07, + "loss": 2.5428, + "loss_": 0.9196, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3933, + "step": 6944 + }, + { + "epoch": 0.87, + "learning_rate": 8.553203933640908e-07, + "loss": 2.5645, + "loss_": 1.1573, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6951 + }, + { + "epoch": 0.87, + "learning_rate": 8.438556019511568e-07, + "loss": 2.5178, + "loss_": 0.9821, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6958 + }, + { + "epoch": 0.87, + "learning_rate": 8.324647817907427e-07, + "loss": 2.563, + "loss_": 1.2098, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 6965 + }, + { + "epoch": 0.87, + "learning_rate": 8.211480249077441e-07, + "loss": 2.5238, + "loss_": 1.3804, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6972 + }, + { + "epoch": 0.87, + "learning_rate": 8.099054227287129e-07, + "loss": 2.5693, + "loss_": 1.077, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6979 + }, + { + "epoch": 0.88, + "learning_rate": 7.987370660811066e-07, + "loss": 2.5288, + "loss_": 0.8073, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 6986 + }, + { + "epoch": 0.88, + "learning_rate": 7.87643045192571e-07, + "loss": 2.501, + "loss_": 1.0172, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 6993 + }, + { + "epoch": 0.88, + "learning_rate": 7.766234496902025e-07, + "loss": 2.5408, + "loss_": 1.3127, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7000 + }, + { + "epoch": 0.88, + "learning_rate": 7.656783685998192e-07, + "loss": 2.5051, + "loss_": 0.7769, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7007 + }, + { + "epoch": 0.88, + "learning_rate": 7.548078903452527e-07, + "loss": 2.5057, + "loss_": 1.2042, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7014 + }, + { + "epoch": 0.88, + "learning_rate": 7.440121027476288e-07, + "loss": 2.5155, + "loss_": 1.0635, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7021 + }, + { + "epoch": 0.88, + "learning_rate": 7.332910930246528e-07, + "loss": 2.521, + "loss_": 1.1436, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7028 + }, + { + "epoch": 0.88, + "learning_rate": 7.226449477899156e-07, + "loss": 2.5023, + "loss_": 1.1022, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7035 + }, + { + "epoch": 0.88, + "learning_rate": 7.120737530521826e-07, + "loss": 2.5197, + "loss_": 0.9678, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7042 + }, + { + "epoch": 0.88, + "learning_rate": 7.015775942147107e-07, + "loss": 2.4997, + "loss_": 1.1735, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7049 + }, + { + "epoch": 0.88, + "learning_rate": 6.911565560745414e-07, + "loss": 2.5206, + "loss_": 1.1128, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7056 + }, + { + "epoch": 0.89, + "learning_rate": 6.808107228218375e-07, + "loss": 2.5601, + "loss_": 1.2738, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7063 + }, + { + "epoch": 0.89, + "learning_rate": 6.705401780391862e-07, + "loss": 2.5242, + "loss_": 0.8098, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3924, + "step": 7070 + }, + { + "epoch": 0.89, + "learning_rate": 6.603450047009286e-07, + "loss": 2.5201, + "loss_": 1.3133, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7077 + }, + { + "epoch": 0.89, + "learning_rate": 6.502252851724922e-07, + "loss": 2.5253, + "loss_": 1.1757, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7084 + }, + { + "epoch": 0.89, + "learning_rate": 6.401811012097248e-07, + "loss": 2.5515, + "loss_": 1.1474, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7091 + }, + { + "epoch": 0.89, + "learning_rate": 6.302125339582266e-07, + "loss": 2.5258, + "loss_": 0.9865, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 7098 + }, + { + "epoch": 0.89, + "learning_rate": 6.203196639527065e-07, + "loss": 2.5225, + "loss_": 1.2881, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7105 + }, + { + "epoch": 0.89, + "learning_rate": 6.105025711163249e-07, + "loss": 2.4979, + "loss_": 1.1086, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7112 + }, + { + "epoch": 0.89, + "learning_rate": 6.007613347600438e-07, + "loss": 2.5174, + "loss_": 0.6185, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7119 + }, + { + "epoch": 0.89, + "learning_rate": 5.910960335819982e-07, + "loss": 2.5157, + "loss_": 0.8674, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7126 + }, + { + "epoch": 0.89, + "learning_rate": 5.815067456668467e-07, + "loss": 2.5212, + "loss_": 1.2026, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7133 + }, + { + "epoch": 0.89, + "learning_rate": 5.719935484851513e-07, + "loss": 2.5215, + "loss_": 1.1283, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7140 + }, + { + "epoch": 0.9, + "learning_rate": 5.625565188927462e-07, + "loss": 2.4856, + "loss_": 1.2003, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7147 + }, + { + "epoch": 0.9, + "learning_rate": 5.531957331301152e-07, + "loss": 2.5027, + "loss_": 1.234, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7154 + }, + { + "epoch": 0.9, + "learning_rate": 5.43911266821785e-07, + "loss": 2.527, + "loss_": 1.2273, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7161 + }, + { + "epoch": 0.9, + "learning_rate": 5.347031949756987e-07, + "loss": 2.5546, + "loss_": 1.1449, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7168 + }, + { + "epoch": 0.9, + "learning_rate": 5.255715919826254e-07, + "loss": 2.5321, + "loss_": 1.0256, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 7175 + }, + { + "epoch": 0.9, + "learning_rate": 5.165165316155519e-07, + "loss": 2.5126, + "loss_": 0.9853, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7182 + }, + { + "epoch": 0.9, + "learning_rate": 5.075380870290847e-07, + "loss": 2.5047, + "loss_": 1.0876, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3935, + "step": 7189 + }, + { + "epoch": 0.9, + "learning_rate": 4.986363307588648e-07, + "loss": 2.5314, + "loss_": 1.1338, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7196 + }, + { + "epoch": 0.9, + "learning_rate": 4.898113347209788e-07, + "loss": 2.493, + "loss_": 1.0642, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7203 + }, + { + "epoch": 0.9, + "learning_rate": 4.810631702113722e-07, + "loss": 2.4985, + "loss_": 0.9782, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7210 + }, + { + "epoch": 0.9, + "learning_rate": 4.723919079052874e-07, + "loss": 2.5288, + "loss_": 1.4337, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7217 + }, + { + "epoch": 0.91, + "learning_rate": 4.637976178566772e-07, + "loss": 2.5036, + "loss_": 1.2508, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7224 + }, + { + "epoch": 0.91, + "learning_rate": 4.5528036949765155e-07, + "loss": 2.5172, + "loss_": 1.0779, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7231 + }, + { + "epoch": 0.91, + "learning_rate": 4.46840231637905e-07, + "loss": 2.4901, + "loss_": 1.1486, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7238 + }, + { + "epoch": 0.91, + "learning_rate": 4.3847727246417283e-07, + "loss": 2.5265, + "loss_": 1.089, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7245 + }, + { + "epoch": 0.91, + "learning_rate": 4.3019155953966995e-07, + "loss": 2.534, + "loss_": 1.0542, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7252 + }, + { + "epoch": 0.91, + "learning_rate": 4.2198315980355066e-07, + "loss": 2.4964, + "loss_": 1.2698, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7259 + }, + { + "epoch": 0.91, + "learning_rate": 4.1385213957036763e-07, + "loss": 2.4997, + "loss_": 1.0581, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7266 + }, + { + "epoch": 0.91, + "learning_rate": 4.057985645295337e-07, + "loss": 2.5273, + "loss_": 1.1317, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7273 + }, + { + "epoch": 0.91, + "learning_rate": 3.9782249974479105e-07, + "loss": 2.506, + "loss_": 1.1506, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7280 + }, + { + "epoch": 0.91, + "learning_rate": 3.899240096536905e-07, + "loss": 2.5387, + "loss_": 1.1072, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7287 + }, + { + "epoch": 0.91, + "learning_rate": 3.8210315806706535e-07, + "loss": 2.5092, + "loss_": 0.8953, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7294 + }, + { + "epoch": 0.92, + "learning_rate": 3.7436000816851504e-07, + "loss": 2.5046, + "loss_": 1.1341, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7301 + }, + { + "epoch": 0.92, + "learning_rate": 3.666946225139045e-07, + "loss": 2.5416, + "loss_": 1.2474, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7308 + }, + { + "epoch": 0.92, + "learning_rate": 3.5910706303084574e-07, + "loss": 2.521, + "loss_": 0.4458, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7315 + }, + { + "epoch": 0.92, + "learning_rate": 3.515973910182069e-07, + "loss": 2.5129, + "loss_": 1.0651, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7322 + }, + { + "epoch": 0.92, + "learning_rate": 3.4416566714561174e-07, + "loss": 2.5029, + "loss_": 1.0087, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7329 + }, + { + "epoch": 0.92, + "learning_rate": 3.368119514529533e-07, + "loss": 2.5211, + "loss_": 1.3133, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7336 + }, + { + "epoch": 0.92, + "learning_rate": 3.295363033499066e-07, + "loss": 2.5431, + "loss_": 1.005, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7343 + }, + { + "epoch": 0.92, + "learning_rate": 3.223387816154466e-07, + "loss": 2.5257, + "loss_": 1.1226, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7350 + }, + { + "epoch": 0.92, + "learning_rate": 3.1521944439738104e-07, + "loss": 2.52, + "loss_": 1.0687, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7357 + }, + { + "epoch": 0.92, + "learning_rate": 3.081783492118706e-07, + "loss": 2.5225, + "loss_": 1.1186, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7364 + }, + { + "epoch": 0.92, + "learning_rate": 3.012155529429728e-07, + "loss": 2.5037, + "loss_": 1.177, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7371 + }, + { + "epoch": 0.92, + "learning_rate": 2.9433111184217656e-07, + "loss": 2.5031, + "loss_": 1.2232, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7378 + }, + { + "epoch": 0.93, + "learning_rate": 2.875250815279518e-07, + "loss": 2.525, + "loss_": 1.0972, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7385 + }, + { + "epoch": 0.93, + "learning_rate": 2.807975169852939e-07, + "loss": 2.5529, + "loss_": 0.9882, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7392 + }, + { + "epoch": 0.93, + "learning_rate": 2.7414847256528985e-07, + "loss": 2.5408, + "loss_": 1.3431, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7399 + }, + { + "epoch": 0.93, + "learning_rate": 2.675780019846697e-07, + "loss": 2.5362, + "loss_": 1.0673, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7406 + }, + { + "epoch": 0.93, + "learning_rate": 2.6108615832537765e-07, + "loss": 2.4947, + "loss_": 1.136, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7413 + }, + { + "epoch": 0.93, + "learning_rate": 2.546729940341386e-07, + "loss": 2.4788, + "loss_": 1.2069, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7420 + }, + { + "epoch": 0.93, + "learning_rate": 2.4833856092204124e-07, + "loss": 2.5045, + "loss_": 1.1985, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7427 + }, + { + "epoch": 0.93, + "learning_rate": 2.4208291016411536e-07, + "loss": 2.5433, + "loss_": 1.0931, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7434 + }, + { + "epoch": 0.93, + "learning_rate": 2.3590609229891537e-07, + "loss": 2.5113, + "loss_": 1.2254, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7441 + }, + { + "epoch": 0.93, + "learning_rate": 2.2980815722811855e-07, + "loss": 2.5482, + "loss_": 1.243, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7448 + }, + { + "epoch": 0.93, + "learning_rate": 2.2378915421611746e-07, + "loss": 2.5383, + "loss_": 1.2111, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7455 + }, + { + "epoch": 0.94, + "learning_rate": 2.1784913188962365e-07, + "loss": 2.5388, + "loss_": 1.0636, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7462 + }, + { + "epoch": 0.94, + "learning_rate": 2.119881382372746e-07, + "loss": 2.502, + "loss_": 1.0249, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7469 + }, + { + "epoch": 0.94, + "learning_rate": 2.0620622060924522e-07, + "loss": 2.5066, + "loss_": 0.847, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7476 + }, + { + "epoch": 0.94, + "learning_rate": 2.0050342571686589e-07, + "loss": 2.5093, + "loss_": 1.1436, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7483 + }, + { + "epoch": 0.94, + "learning_rate": 1.9487979963224712e-07, + "loss": 2.5268, + "loss_": 1.0466, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7490 + }, + { + "epoch": 0.94, + "learning_rate": 1.8933538778790118e-07, + "loss": 2.5413, + "loss_": 1.3326, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7497 + }, + { + "epoch": 0.94, + "learning_rate": 1.8387023497638324e-07, + "loss": 2.5318, + "loss_": 1.1098, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7504 + }, + { + "epoch": 0.94, + "learning_rate": 1.7848438534992407e-07, + "loss": 2.5091, + "loss_": 0.8163, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7511 + }, + { + "epoch": 0.94, + "learning_rate": 1.7317788242007361e-07, + "loss": 2.5119, + "loss_": 0.9991, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7518 + }, + { + "epoch": 0.94, + "learning_rate": 1.679507690573523e-07, + "loss": 2.5155, + "loss_": 1.0626, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7525 + }, + { + "epoch": 0.94, + "learning_rate": 1.6280308749090036e-07, + "loss": 2.5254, + "loss_": 1.207, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7532 + }, + { + "epoch": 0.95, + "learning_rate": 1.5773487930814345e-07, + "loss": 2.516, + "loss_": 1.1184, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7539 + }, + { + "epoch": 0.95, + "learning_rate": 1.5274618545444985e-07, + "loss": 2.5236, + "loss_": 1.3532, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7546 + }, + { + "epoch": 0.95, + "learning_rate": 1.4783704623280048e-07, + "loss": 2.53, + "loss_": 1.1781, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7553 + }, + { + "epoch": 0.95, + "learning_rate": 1.430075013034693e-07, + "loss": 2.5614, + "loss_": 1.17, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7560 + }, + { + "epoch": 0.95, + "learning_rate": 1.3825758968369684e-07, + "loss": 2.5444, + "loss_": 1.2882, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7567 + }, + { + "epoch": 0.95, + "learning_rate": 1.335873497473761e-07, + "loss": 2.5334, + "loss_": 1.429, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7574 + }, + { + "epoch": 0.95, + "learning_rate": 1.2899681922474482e-07, + "loss": 2.5562, + "loss_": 1.0573, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7581 + }, + { + "epoch": 0.95, + "learning_rate": 1.2448603520207603e-07, + "loss": 2.4816, + "loss_": 1.1828, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7588 + }, + { + "epoch": 0.95, + "learning_rate": 1.2005503412138685e-07, + "loss": 2.5387, + "loss_": 1.1538, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7595 + }, + { + "epoch": 0.95, + "learning_rate": 1.1570385178013454e-07, + "loss": 2.5206, + "loss_": 0.9157, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7602 + }, + { + "epoch": 0.95, + "learning_rate": 1.1143252333093213e-07, + "loss": 2.4838, + "loss_": 0.7925, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7609 + }, + { + "epoch": 0.95, + "learning_rate": 1.0724108328126647e-07, + "loss": 2.5101, + "loss_": 1.2665, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7616 + }, + { + "epoch": 0.96, + "learning_rate": 1.0312956549321407e-07, + "loss": 2.501, + "loss_": 1.0971, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7623 + }, + { + "epoch": 0.96, + "learning_rate": 9.909800318317008e-08, + "loss": 2.5096, + "loss_": 1.2361, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7630 + }, + { + "epoch": 0.96, + "learning_rate": 9.51464289215831e-08, + "loss": 2.5052, + "loss_": 0.8553, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7637 + }, + { + "epoch": 0.96, + "learning_rate": 9.127487463268636e-08, + "loss": 2.5264, + "loss_": 0.9662, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3929, + "step": 7644 + }, + { + "epoch": 0.96, + "learning_rate": 8.748337159424247e-08, + "loss": 2.5115, + "loss_": 0.9187, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7651 + }, + { + "epoch": 0.96, + "learning_rate": 8.377195043729358e-08, + "loss": 2.519, + "loss_": 1.3642, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7658 + }, + { + "epoch": 0.96, + "learning_rate": 8.014064114590936e-08, + "loss": 2.5068, + "loss_": 1.052, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7665 + }, + { + "epoch": 0.96, + "learning_rate": 7.658947305694497e-08, + "loss": 2.4804, + "loss_": 1.1747, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7672 + }, + { + "epoch": 0.96, + "learning_rate": 7.311847485980794e-08, + "loss": 2.5175, + "loss_": 1.0972, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7679 + }, + { + "epoch": 0.96, + "learning_rate": 6.972767459622387e-08, + "loss": 2.4884, + "loss_": 0.9969, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7686 + }, + { + "epoch": 0.96, + "learning_rate": 6.641709966000886e-08, + "loss": 2.5285, + "loss_": 0.9687, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7693 + }, + { + "epoch": 0.97, + "learning_rate": 6.318677679685081e-08, + "loss": 2.4968, + "loss_": 1.151, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7700 + }, + { + "epoch": 0.97, + "learning_rate": 6.003673210409067e-08, + "loss": 2.5292, + "loss_": 0.7981, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7707 + }, + { + "epoch": 0.97, + "learning_rate": 5.696699103051484e-08, + "loss": 2.5312, + "loss_": 1.1779, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7714 + }, + { + "epoch": 0.97, + "learning_rate": 5.3977578376144257e-08, + "loss": 2.5219, + "loss_": 1.0229, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3895, + "step": 7721 + }, + { + "epoch": 0.97, + "learning_rate": 5.1068518292042293e-08, + "loss": 2.5087, + "loss_": 1.0511, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3929, + "step": 7728 + }, + { + "epoch": 0.97, + "learning_rate": 4.823983428010936e-08, + "loss": 2.4938, + "loss_": 0.9396, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7735 + }, + { + "epoch": 0.97, + "learning_rate": 4.549154919290199e-08, + "loss": 2.554, + "loss_": 1.0694, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7742 + }, + { + "epoch": 0.97, + "learning_rate": 4.2823685233445155e-08, + "loss": 2.5225, + "loss_": 1.023, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7749 + }, + { + "epoch": 0.97, + "learning_rate": 4.0236263955049095e-08, + "loss": 2.4966, + "loss_": 1.2173, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7756 + }, + { + "epoch": 0.97, + "learning_rate": 3.7729306261141685e-08, + "loss": 2.5165, + "loss_": 1.0892, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7763 + }, + { + "epoch": 0.97, + "learning_rate": 3.530283240509414e-08, + "loss": 2.5096, + "loss_": 0.9228, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7770 + }, + { + "epoch": 0.97, + "learning_rate": 3.2956861990062203e-08, + "loss": 2.5389, + "loss_": 1.1298, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7777 + }, + { + "epoch": 0.98, + "learning_rate": 3.0691413968821915e-08, + "loss": 2.5103, + "loss_": 0.8584, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7784 + }, + { + "epoch": 0.98, + "learning_rate": 2.8506506643621866e-08, + "loss": 2.5083, + "loss_": 1.001, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7791 + }, + { + "epoch": 0.98, + "learning_rate": 2.6402157666034488e-08, + "loss": 2.5312, + "loss_": 1.1118, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7798 + }, + { + "epoch": 0.98, + "learning_rate": 2.4378384036808368e-08, + "loss": 2.4881, + "loss_": 1.061, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7805 + }, + { + "epoch": 0.98, + "learning_rate": 2.243520210573946e-08, + "loss": 2.529, + "loss_": 1.1615, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7812 + }, + { + "epoch": 0.98, + "learning_rate": 2.0572627571529e-08, + "loss": 2.5355, + "loss_": 1.0449, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7819 + }, + { + "epoch": 0.98, + "learning_rate": 1.8790675481666908e-08, + "loss": 2.5263, + "loss_": 0.9646, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7826 + }, + { + "epoch": 0.98, + "learning_rate": 1.70893602323019e-08, + "loss": 2.5028, + "loss_": 1.3031, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7833 + }, + { + "epoch": 0.98, + "learning_rate": 1.5468695568131576e-08, + "loss": 2.5335, + "loss_": 1.0215, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7840 + }, + { + "epoch": 0.98, + "learning_rate": 1.3928694582284741e-08, + "loss": 2.5138, + "loss_": 1.0019, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7847 + }, + { + "epoch": 0.98, + "learning_rate": 1.246936971622148e-08, + "loss": 2.5227, + "loss_": 1.2481, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7854 + }, + { + "epoch": 0.99, + "learning_rate": 1.1090732759631018e-08, + "loss": 2.5075, + "loss_": 1.1757, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7861 + }, + { + "epoch": 0.99, + "learning_rate": 9.79279485033402e-09, + "loss": 2.524, + "loss_": 1.3905, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7868 + }, + { + "epoch": 0.99, + "learning_rate": 8.575566474195996e-09, + "loss": 2.5326, + "loss_": 1.1002, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7875 + }, + { + "epoch": 0.99, + "learning_rate": 7.43905746503959e-09, + "loss": 2.5051, + "loss_": 0.7085, + "moe_loss": 0.1602, + "moe_loss_longrong": 1.3925, + "step": 7882 + }, + { + "epoch": 0.99, + "learning_rate": 6.383277004569088e-09, + "loss": 2.5389, + "loss_": 1.1412, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7889 + }, + { + "epoch": 0.99, + "learning_rate": 5.408233622289371e-09, + "loss": 2.5165, + "loss_": 0.9492, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7896 + }, + { + "epoch": 0.99, + "learning_rate": 4.513935195445962e-09, + "loss": 2.5191, + "loss_": 0.9539, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3894, + "step": 7903 + }, + { + "epoch": 0.99, + "learning_rate": 3.7003889489550806e-09, + "loss": 2.4844, + "loss_": 0.9553, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7910 + }, + { + "epoch": 0.99, + "learning_rate": 2.9676014553459145e-09, + "loss": 2.5133, + "loss_": 0.8643, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7917 + }, + { + "epoch": 0.99, + "learning_rate": 2.315578634710658e-09, + "loss": 2.4846, + "loss_": 1.0907, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3891, + "step": 7924 + }, + { + "epoch": 0.99, + "learning_rate": 1.7443257546512215e-09, + "loss": 2.4985, + "loss_": 0.8135, + "moe_loss": 0.1601, + "moe_loss_longrong": 1.3927, + "step": 7931 + }, + { + "epoch": 1.0, + "learning_rate": 1.2538474302459246e-09, + "loss": 2.5193, + "loss_": 1.0192, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7938 + }, + { + "epoch": 1.0, + "learning_rate": 8.441476239995361e-10, + "loss": 2.5156, + "loss_": 0.857, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7945 + }, + { + "epoch": 1.0, + "learning_rate": 5.152296458232897e-10, + "loss": 2.5293, + "loss_": 1.1247, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7952 + }, + { + "epoch": 1.0, + "learning_rate": 2.6709615299935763e-10, + "loss": 2.5155, + "loss_": 0.9838, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7959 + }, + { + "epoch": 1.0, + "learning_rate": 9.97491501675274e-11, + "loss": 2.5037, + "loss_": 1.1665, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3892, + "step": 7966 + }, + { + "epoch": 1.0, + "learning_rate": 1.3189989298556527e-11, + "loss": 2.5389, + "loss_": 0.9918, + "moe_loss": 0.16, + "moe_loss_longrong": 1.3893, + "step": 7973 + }, + { + "epoch": 1.0, + "step": 7977, + "total_flos": 1.1960092486052872e+19, + "train_loss": 2.5803030949420793, + "train_runtime": 142162.8835, + "train_samples_per_second": 7.183, + "train_steps_per_second": 0.056 + } + ], + "logging_steps": 7, + "max_steps": 7977, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1.1960092486052872e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}