{ "best_metric": 9.4224620303757, "best_model_checkpoint": "kotoba_v2_enc_logs_epoch2_2/checkpoint-500", "epoch": 0.0034266994716029417, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.4266994716029415e-05, "grad_norm": 1.0561553239822388, "learning_rate": 1e-05, "loss": 0.2361, "step": 10 }, { "epoch": 6.853398943205883e-05, "grad_norm": 1.1626238822937012, "learning_rate": 1e-05, "loss": 0.2265, "step": 20 }, { "epoch": 0.00010280098414808825, "grad_norm": 0.9845689535140991, "learning_rate": 1e-05, "loss": 0.2279, "step": 30 }, { "epoch": 0.00013706797886411766, "grad_norm": 1.142356276512146, "learning_rate": 1e-05, "loss": 0.2382, "step": 40 }, { "epoch": 0.00017133497358014707, "grad_norm": 1.0053240060806274, "learning_rate": 1e-05, "loss": 0.2473, "step": 50 }, { "epoch": 0.0002056019682961765, "grad_norm": 1.1098105907440186, "learning_rate": 1e-05, "loss": 0.2438, "step": 60 }, { "epoch": 0.0002398689630122059, "grad_norm": 1.191983699798584, "learning_rate": 1e-05, "loss": 0.2293, "step": 70 }, { "epoch": 0.0002741359577282353, "grad_norm": 1.1295104026794434, "learning_rate": 1e-05, "loss": 0.2362, "step": 80 }, { "epoch": 0.0003084029524442647, "grad_norm": 1.037972092628479, "learning_rate": 1e-05, "loss": 0.2455, "step": 90 }, { "epoch": 0.00034266994716029413, "grad_norm": 1.1975648403167725, "learning_rate": 1e-05, "loss": 0.2459, "step": 100 }, { "epoch": 0.00037693694187632354, "grad_norm": 1.0676342248916626, "learning_rate": 1e-05, "loss": 0.2271, "step": 110 }, { "epoch": 0.000411203936592353, "grad_norm": 1.0749495029449463, "learning_rate": 1e-05, "loss": 0.2417, "step": 120 }, { "epoch": 0.0004454709313083824, "grad_norm": 1.094260811805725, "learning_rate": 1e-05, "loss": 0.2354, "step": 130 }, { "epoch": 0.0004797379260244118, "grad_norm": 1.0395853519439697, "learning_rate": 1e-05, "loss": 0.2381, "step": 140 }, { "epoch": 0.0005140049207404412, "grad_norm": 1.2008885145187378, "learning_rate": 1e-05, "loss": 0.2354, "step": 150 }, { "epoch": 0.0005482719154564706, "grad_norm": 1.0647832155227661, "learning_rate": 1e-05, "loss": 0.2321, "step": 160 }, { "epoch": 0.0005825389101725, "grad_norm": 1.327071189880371, "learning_rate": 1e-05, "loss": 0.238, "step": 170 }, { "epoch": 0.0006168059048885295, "grad_norm": 1.1184055805206299, "learning_rate": 1e-05, "loss": 0.2242, "step": 180 }, { "epoch": 0.0006510728996045589, "grad_norm": 1.2512784004211426, "learning_rate": 1e-05, "loss": 0.2437, "step": 190 }, { "epoch": 0.0006853398943205883, "grad_norm": 1.0614465475082397, "learning_rate": 1e-05, "loss": 0.2382, "step": 200 }, { "epoch": 0.0007196068890366177, "grad_norm": 1.0607149600982666, "learning_rate": 1e-05, "loss": 0.2381, "step": 210 }, { "epoch": 0.0007538738837526471, "grad_norm": 1.0422028303146362, "learning_rate": 1e-05, "loss": 0.2294, "step": 220 }, { "epoch": 0.0007881408784686765, "grad_norm": 1.0162984132766724, "learning_rate": 1e-05, "loss": 0.2275, "step": 230 }, { "epoch": 0.000822407873184706, "grad_norm": 1.1085543632507324, "learning_rate": 1e-05, "loss": 0.2161, "step": 240 }, { "epoch": 0.0008566748679007354, "grad_norm": 1.1854636669158936, "learning_rate": 1e-05, "loss": 0.2382, "step": 250 }, { "epoch": 0.0008909418626167648, "grad_norm": 1.40137779712677, "learning_rate": 1e-05, "loss": 0.2579, "step": 260 }, { "epoch": 0.0009252088573327942, "grad_norm": 1.0814112424850464, "learning_rate": 1e-05, "loss": 0.2612, "step": 270 }, { "epoch": 0.0009594758520488236, "grad_norm": 1.083736538887024, "learning_rate": 1e-05, "loss": 0.2711, "step": 280 }, { "epoch": 0.000993742846764853, "grad_norm": 1.0861411094665527, "learning_rate": 1e-05, "loss": 0.2642, "step": 290 }, { "epoch": 0.0010280098414808825, "grad_norm": 1.1141265630722046, "learning_rate": 1e-05, "loss": 0.2585, "step": 300 }, { "epoch": 0.0010622768361969119, "grad_norm": 1.326241374015808, "learning_rate": 1e-05, "loss": 0.2858, "step": 310 }, { "epoch": 0.0010965438309129413, "grad_norm": 1.393750786781311, "learning_rate": 1e-05, "loss": 0.2635, "step": 320 }, { "epoch": 0.0011308108256289707, "grad_norm": 1.0851459503173828, "learning_rate": 1e-05, "loss": 0.2565, "step": 330 }, { "epoch": 0.001165077820345, "grad_norm": 1.2323757410049438, "learning_rate": 1e-05, "loss": 0.2465, "step": 340 }, { "epoch": 0.0011993448150610295, "grad_norm": 1.376953125, "learning_rate": 1e-05, "loss": 0.2671, "step": 350 }, { "epoch": 0.001233611809777059, "grad_norm": 1.084592580795288, "learning_rate": 1e-05, "loss": 0.2643, "step": 360 }, { "epoch": 0.0012678788044930883, "grad_norm": 1.2907005548477173, "learning_rate": 1e-05, "loss": 0.2584, "step": 370 }, { "epoch": 0.0013021457992091177, "grad_norm": 1.0698130130767822, "learning_rate": 1e-05, "loss": 0.2526, "step": 380 }, { "epoch": 0.0013364127939251471, "grad_norm": 1.1399807929992676, "learning_rate": 1e-05, "loss": 0.2759, "step": 390 }, { "epoch": 0.0013706797886411765, "grad_norm": 1.1480791568756104, "learning_rate": 1e-05, "loss": 0.2499, "step": 400 }, { "epoch": 0.001404946783357206, "grad_norm": 1.3095237016677856, "learning_rate": 1e-05, "loss": 0.2536, "step": 410 }, { "epoch": 0.0014392137780732353, "grad_norm": 1.068246841430664, "learning_rate": 1e-05, "loss": 0.2604, "step": 420 }, { "epoch": 0.0014734807727892648, "grad_norm": 1.2310419082641602, "learning_rate": 1e-05, "loss": 0.2632, "step": 430 }, { "epoch": 0.0015077477675052942, "grad_norm": 1.161867380142212, "learning_rate": 1e-05, "loss": 0.2584, "step": 440 }, { "epoch": 0.0015420147622213236, "grad_norm": 1.1461217403411865, "learning_rate": 1e-05, "loss": 0.2592, "step": 450 }, { "epoch": 0.001576281756937353, "grad_norm": 1.3006030321121216, "learning_rate": 1e-05, "loss": 0.2607, "step": 460 }, { "epoch": 0.0016105487516533824, "grad_norm": 1.1223125457763672, "learning_rate": 1e-05, "loss": 0.2433, "step": 470 }, { "epoch": 0.001644815746369412, "grad_norm": 1.2909380197525024, "learning_rate": 1e-05, "loss": 0.2693, "step": 480 }, { "epoch": 0.0016790827410854414, "grad_norm": 1.2270597219467163, "learning_rate": 1e-05, "loss": 0.2661, "step": 490 }, { "epoch": 0.0017133497358014708, "grad_norm": 1.1439770460128784, "learning_rate": 1e-05, "loss": 0.2517, "step": 500 }, { "epoch": 0.0017133497358014708, "eval_cer": 13.0358087846181, "eval_loss": 0.25224336981773376, "eval_normalized_cer": 9.4224620303757, "eval_runtime": 227.2174, "eval_samples_per_second": 2.253, "eval_steps_per_second": 0.035, "step": 500 }, { "epoch": 0.0017476167305175002, "grad_norm": 1.1377454996109009, "learning_rate": 1e-05, "loss": 0.2579, "step": 510 }, { "epoch": 0.0017818837252335296, "grad_norm": 1.2096498012542725, "learning_rate": 1e-05, "loss": 0.2727, "step": 520 }, { "epoch": 0.001816150719949559, "grad_norm": 1.187213659286499, "learning_rate": 1e-05, "loss": 0.2562, "step": 530 }, { "epoch": 0.0018504177146655885, "grad_norm": 0.969393253326416, "learning_rate": 1e-05, "loss": 0.2378, "step": 540 }, { "epoch": 0.0018846847093816179, "grad_norm": 0.9745528697967529, "learning_rate": 1e-05, "loss": 0.2774, "step": 550 }, { "epoch": 0.0019189517040976473, "grad_norm": 1.0725352764129639, "learning_rate": 1e-05, "loss": 0.2541, "step": 560 }, { "epoch": 0.0019532186988136767, "grad_norm": 1.217871904373169, "learning_rate": 1e-05, "loss": 0.2395, "step": 570 }, { "epoch": 0.001987485693529706, "grad_norm": 1.3582627773284912, "learning_rate": 1e-05, "loss": 0.2594, "step": 580 }, { "epoch": 0.0020217526882457355, "grad_norm": 1.2415379285812378, "learning_rate": 1e-05, "loss": 0.2582, "step": 590 }, { "epoch": 0.002056019682961765, "grad_norm": 0.9810131192207336, "learning_rate": 1e-05, "loss": 0.2284, "step": 600 }, { "epoch": 0.0020902866776777943, "grad_norm": 0.9806564450263977, "learning_rate": 1e-05, "loss": 0.2688, "step": 610 }, { "epoch": 0.0021245536723938237, "grad_norm": 1.2755467891693115, "learning_rate": 1e-05, "loss": 0.2591, "step": 620 }, { "epoch": 0.002158820667109853, "grad_norm": 0.9300326704978943, "learning_rate": 1e-05, "loss": 0.2444, "step": 630 }, { "epoch": 0.0021930876618258825, "grad_norm": 1.1276524066925049, "learning_rate": 1e-05, "loss": 0.236, "step": 640 }, { "epoch": 0.002227354656541912, "grad_norm": 1.1786876916885376, "learning_rate": 1e-05, "loss": 0.2443, "step": 650 }, { "epoch": 0.0022616216512579414, "grad_norm": 1.1702712774276733, "learning_rate": 1e-05, "loss": 0.2627, "step": 660 }, { "epoch": 0.0022958886459739708, "grad_norm": 1.2837899923324585, "learning_rate": 1e-05, "loss": 0.2378, "step": 670 }, { "epoch": 0.00233015564069, "grad_norm": 1.0623608827590942, "learning_rate": 1e-05, "loss": 0.2491, "step": 680 }, { "epoch": 0.0023644226354060296, "grad_norm": 1.1288243532180786, "learning_rate": 1e-05, "loss": 0.2773, "step": 690 }, { "epoch": 0.002398689630122059, "grad_norm": 1.0192692279815674, "learning_rate": 1e-05, "loss": 0.2492, "step": 700 }, { "epoch": 0.0024329566248380884, "grad_norm": 1.2274680137634277, "learning_rate": 1e-05, "loss": 0.2345, "step": 710 }, { "epoch": 0.002467223619554118, "grad_norm": 1.240645170211792, "learning_rate": 1e-05, "loss": 0.2624, "step": 720 }, { "epoch": 0.002501490614270147, "grad_norm": 1.0681366920471191, "learning_rate": 1e-05, "loss": 0.2553, "step": 730 }, { "epoch": 0.0025357576089861766, "grad_norm": 1.0161867141723633, "learning_rate": 1e-05, "loss": 0.2547, "step": 740 }, { "epoch": 0.002570024603702206, "grad_norm": 1.2384017705917358, "learning_rate": 1e-05, "loss": 0.2449, "step": 750 }, { "epoch": 0.0026042915984182354, "grad_norm": 1.1739261150360107, "learning_rate": 1e-05, "loss": 0.2523, "step": 760 }, { "epoch": 0.002638558593134265, "grad_norm": 1.0396535396575928, "learning_rate": 1e-05, "loss": 0.2535, "step": 770 }, { "epoch": 0.0026728255878502943, "grad_norm": 1.14767324924469, "learning_rate": 1e-05, "loss": 0.2594, "step": 780 }, { "epoch": 0.0027070925825663237, "grad_norm": 1.1783303022384644, "learning_rate": 1e-05, "loss": 0.2546, "step": 790 }, { "epoch": 0.002741359577282353, "grad_norm": 1.1065645217895508, "learning_rate": 1e-05, "loss": 0.2547, "step": 800 }, { "epoch": 0.0027756265719983825, "grad_norm": 1.256645917892456, "learning_rate": 1e-05, "loss": 0.2548, "step": 810 }, { "epoch": 0.002809893566714412, "grad_norm": 1.058158278465271, "learning_rate": 1e-05, "loss": 0.257, "step": 820 }, { "epoch": 0.0028441605614304413, "grad_norm": 1.0647656917572021, "learning_rate": 1e-05, "loss": 0.2479, "step": 830 }, { "epoch": 0.0028784275561464707, "grad_norm": 1.1984691619873047, "learning_rate": 1e-05, "loss": 0.2503, "step": 840 }, { "epoch": 0.0029126945508625, "grad_norm": 1.1380070447921753, "learning_rate": 1e-05, "loss": 0.245, "step": 850 }, { "epoch": 0.0029469615455785295, "grad_norm": 1.2131065130233765, "learning_rate": 1e-05, "loss": 0.242, "step": 860 }, { "epoch": 0.002981228540294559, "grad_norm": 1.1822234392166138, "learning_rate": 1e-05, "loss": 0.2613, "step": 870 }, { "epoch": 0.0030154955350105883, "grad_norm": 1.0591018199920654, "learning_rate": 1e-05, "loss": 0.2654, "step": 880 }, { "epoch": 0.0030497625297266177, "grad_norm": 1.2318428754806519, "learning_rate": 1e-05, "loss": 0.2525, "step": 890 }, { "epoch": 0.003084029524442647, "grad_norm": 1.0146839618682861, "learning_rate": 1e-05, "loss": 0.2609, "step": 900 }, { "epoch": 0.0031182965191586766, "grad_norm": 1.1508561372756958, "learning_rate": 1e-05, "loss": 0.2541, "step": 910 }, { "epoch": 0.003152563513874706, "grad_norm": 1.1494849920272827, "learning_rate": 1e-05, "loss": 0.2461, "step": 920 }, { "epoch": 0.0031868305085907354, "grad_norm": 1.2423807382583618, "learning_rate": 1e-05, "loss": 0.2573, "step": 930 }, { "epoch": 0.0032210975033067648, "grad_norm": 1.2714438438415527, "learning_rate": 1e-05, "loss": 0.2545, "step": 940 }, { "epoch": 0.0032553644980227946, "grad_norm": 1.2088007926940918, "learning_rate": 1e-05, "loss": 0.2773, "step": 950 }, { "epoch": 0.003289631492738824, "grad_norm": 1.0737963914871216, "learning_rate": 1e-05, "loss": 0.2495, "step": 960 }, { "epoch": 0.0033238984874548534, "grad_norm": 1.0942472219467163, "learning_rate": 1e-05, "loss": 0.2401, "step": 970 }, { "epoch": 0.003358165482170883, "grad_norm": 1.1282986402511597, "learning_rate": 1e-05, "loss": 0.2638, "step": 980 }, { "epoch": 0.0033924324768869123, "grad_norm": 1.0762425661087036, "learning_rate": 1e-05, "loss": 0.2619, "step": 990 }, { "epoch": 0.0034266994716029417, "grad_norm": 1.09200119972229, "learning_rate": 1e-05, "loss": 0.2464, "step": 1000 }, { "epoch": 0.0034266994716029417, "eval_cer": 13.80313988357735, "eval_loss": 0.25397512316703796, "eval_normalized_cer": 9.952038369304557, "eval_runtime": 227.5088, "eval_samples_per_second": 2.25, "eval_steps_per_second": 0.035, "step": 1000 } ], "logging_steps": 10, "max_steps": 291826, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0275903070208e+20, "train_batch_size": 128, "trial_name": null, "trial_params": null }