{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.712550607287449, "eval_steps": 100, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06477732793522267, "grad_norm": 0.08484393358230591, "learning_rate": 2.5e-08, "loss": 0.693, "mean_token_accuracy": 0.8232172057032585, "step": 1 }, { "epoch": 0.12955465587044535, "grad_norm": 0.08104647696018219, "learning_rate": 5e-08, "loss": 0.6361, "mean_token_accuracy": 0.8352038823068142, "step": 2 }, { "epoch": 0.19433198380566802, "grad_norm": 0.08257265388965607, "learning_rate": 7.5e-08, "loss": 0.669, "mean_token_accuracy": 0.8248961418867111, "step": 3 }, { "epoch": 0.2591093117408907, "grad_norm": 0.08169891685247421, "learning_rate": 1e-07, "loss": 0.654, "mean_token_accuracy": 0.8317753672599792, "step": 4 }, { "epoch": 0.32388663967611336, "grad_norm": 0.07086937129497528, "learning_rate": 1.25e-07, "loss": 0.6029, "mean_token_accuracy": 0.8417873457074165, "step": 5 }, { "epoch": 0.38866396761133604, "grad_norm": 0.08497548848390579, "learning_rate": 1.5e-07, "loss": 0.6561, "mean_token_accuracy": 0.8321865610778332, "step": 6 }, { "epoch": 0.4534412955465587, "grad_norm": 0.07375866174697876, "learning_rate": 1.75e-07, "loss": 0.6029, "mean_token_accuracy": 0.8472057469189167, "step": 7 }, { "epoch": 0.5182186234817814, "grad_norm": 0.08089514076709747, "learning_rate": 2e-07, "loss": 0.6429, "mean_token_accuracy": 0.8303790092468262, "step": 8 }, { "epoch": 0.582995951417004, "grad_norm": 0.07564779371023178, "learning_rate": 1.9989008914857112e-07, "loss": 0.6453, "mean_token_accuracy": 0.8341928347945213, "step": 9 }, { "epoch": 0.6477732793522267, "grad_norm": 0.08072903752326965, "learning_rate": 1.995605982021898e-07, "loss": 0.6476, "mean_token_accuracy": 0.8313158191740513, "step": 10 }, { "epoch": 0.7125506072874493, "grad_norm": 0.08074145019054413, "learning_rate": 1.9901225145346506e-07, "loss": 0.6519, "mean_token_accuracy": 0.8337250761687756, "step": 11 }, { "epoch": 0.7773279352226721, "grad_norm": 0.08061974495649338, "learning_rate": 1.9824625428755758e-07, "loss": 0.651, "mean_token_accuracy": 0.8378880433738232, "step": 12 }, { "epoch": 0.8421052631578947, "grad_norm": 0.0789395272731781, "learning_rate": 1.9726429053248129e-07, "loss": 0.6443, "mean_token_accuracy": 0.8334936276078224, "step": 13 }, { "epoch": 0.9068825910931174, "grad_norm": 0.07814885675907135, "learning_rate": 1.96068518757684e-07, "loss": 0.6565, "mean_token_accuracy": 0.8271612599492073, "step": 14 }, { "epoch": 0.97165991902834, "grad_norm": 0.07767179608345032, "learning_rate": 1.946615675290434e-07, "loss": 0.6348, "mean_token_accuracy": 0.8349605351686478, "step": 15 }, { "epoch": 1.0, "grad_norm": 0.07767179608345032, "learning_rate": 1.9304652963070865e-07, "loss": 0.6711, "mean_token_accuracy": 0.8187758156231472, "step": 16 }, { "epoch": 1.0647773279352226, "grad_norm": 0.1275881677865982, "learning_rate": 1.9122695526648966e-07, "loss": 0.6142, "mean_token_accuracy": 0.838625643402338, "step": 17 }, { "epoch": 1.1295546558704452, "grad_norm": 0.07696238905191422, "learning_rate": 1.8920684425573862e-07, "loss": 0.6289, "mean_token_accuracy": 0.835904911160469, "step": 18 }, { "epoch": 1.194331983805668, "grad_norm": 0.08134917914867401, "learning_rate": 1.8699063724087903e-07, "loss": 0.6511, "mean_token_accuracy": 0.830647598952055, "step": 19 }, { "epoch": 1.2591093117408907, "grad_norm": 0.08185581117868423, "learning_rate": 1.8458320592590972e-07, "loss": 0.6395, "mean_token_accuracy": 0.8346012011170387, "step": 20 }, { "epoch": 1.3238866396761133, "grad_norm": 0.07670310139656067, "learning_rate": 1.8198984236734245e-07, "loss": 0.5885, "mean_token_accuracy": 0.8427848629653454, "step": 21 }, { "epoch": 1.3886639676113361, "grad_norm": 0.0753348097205162, "learning_rate": 1.792162473411129e-07, "loss": 0.6299, "mean_token_accuracy": 0.834791149944067, "step": 22 }, { "epoch": 1.4534412955465588, "grad_norm": 0.07955452054738998, "learning_rate": 1.7626851781103817e-07, "loss": 0.6499, "mean_token_accuracy": 0.8349028266966343, "step": 23 }, { "epoch": 1.5182186234817814, "grad_norm": 0.07731788605451584, "learning_rate": 1.731531335263669e-07, "loss": 0.6252, "mean_token_accuracy": 0.8352472670376301, "step": 24 }, { "epoch": 1.582995951417004, "grad_norm": 0.07537000626325607, "learning_rate": 1.6987694277788415e-07, "loss": 0.6405, "mean_token_accuracy": 0.8319784663617611, "step": 25 }, { "epoch": 1.6477732793522266, "grad_norm": 0.08128884434700012, "learning_rate": 1.6644714734388215e-07, "loss": 0.6501, "mean_token_accuracy": 0.8315212316811085, "step": 26 }, { "epoch": 1.7125506072874495, "grad_norm": 0.08110049366950989, "learning_rate": 1.628712866590885e-07, "loss": 0.6727, "mean_token_accuracy": 0.8280021287500858, "step": 27 }, { "epoch": 1.777327935222672, "grad_norm": 0.08107449114322662, "learning_rate": 1.5915722124135225e-07, "loss": 0.6888, "mean_token_accuracy": 0.8243995904922485, "step": 28 }, { "epoch": 1.8421052631578947, "grad_norm": 0.08128422498703003, "learning_rate": 1.5531311541251992e-07, "loss": 0.6592, "mean_token_accuracy": 0.8349018841981888, "step": 29 }, { "epoch": 1.9068825910931175, "grad_norm": 0.08368204534053802, "learning_rate": 1.5134741935148418e-07, "loss": 0.6931, "mean_token_accuracy": 0.8207659162580967, "step": 30 }, { "epoch": 1.97165991902834, "grad_norm": 0.08243861049413681, "learning_rate": 1.4726885051885652e-07, "loss": 0.677, "mean_token_accuracy": 0.829660214483738, "step": 31 }, { "epoch": 2.0, "grad_norm": 0.08243861049413681, "learning_rate": 1.4308637449409705e-07, "loss": 0.6144, "mean_token_accuracy": 0.8387703725269863, "step": 32 }, { "epoch": 2.064777327935223, "grad_norm": 0.1279604583978653, "learning_rate": 1.3880918526722496e-07, "loss": 0.6332, "mean_token_accuracy": 0.8366866521537304, "step": 33 }, { "epoch": 2.1295546558704452, "grad_norm": 0.08055976778268814, "learning_rate": 1.344466850284333e-07, "loss": 0.6709, "mean_token_accuracy": 0.8283472806215286, "step": 34 }, { "epoch": 2.194331983805668, "grad_norm": 0.08526450395584106, "learning_rate": 1.3000846350003406e-07, "loss": 0.6429, "mean_token_accuracy": 0.8317065984010696, "step": 35 }, { "epoch": 2.2591093117408905, "grad_norm": 0.07659583538770676, "learning_rate": 1.2550427685616765e-07, "loss": 0.6523, "mean_token_accuracy": 0.8286443240940571, "step": 36 }, { "epoch": 2.3238866396761133, "grad_norm": 0.07898923009634018, "learning_rate": 1.2094402627661445e-07, "loss": 0.6219, "mean_token_accuracy": 0.8340383470058441, "step": 37 }, { "epoch": 2.388663967611336, "grad_norm": 0.07619204372167587, "learning_rate": 1.1633773618185301e-07, "loss": 0.6139, "mean_token_accuracy": 0.8424343690276146, "step": 38 }, { "epoch": 2.4534412955465585, "grad_norm": 0.07402335107326508, "learning_rate": 1.1169553219720827e-07, "loss": 0.6798, "mean_token_accuracy": 0.8285421878099442, "step": 39 }, { "epoch": 2.5182186234817814, "grad_norm": 0.08590105175971985, "learning_rate": 1.0702761889452929e-07, "loss": 0.6632, "mean_token_accuracy": 0.8267885185778141, "step": 40 }, { "epoch": 2.582995951417004, "grad_norm": 0.08060938864946365, "learning_rate": 1.0234425736032605e-07, "loss": 0.6725, "mean_token_accuracy": 0.8298077434301376, "step": 41 }, { "epoch": 2.6477732793522266, "grad_norm": 0.08330174535512924, "learning_rate": 9.765574263967395e-08, "loss": 0.6404, "mean_token_accuracy": 0.8322437591850758, "step": 42 }, { "epoch": 2.7125506072874495, "grad_norm": 0.0802123174071312, "learning_rate": 9.297238110547073e-08, "loss": 0.6333, "mean_token_accuracy": 0.8358821533620358, "step": 43 }, { "epoch": 2.7773279352226723, "grad_norm": 0.07823917269706726, "learning_rate": 8.830446780279175e-08, "loss": 0.633, "mean_token_accuracy": 0.8377866670489311, "step": 44 }, { "epoch": 2.8421052631578947, "grad_norm": 0.07679135352373123, "learning_rate": 8.366226381814697e-08, "loss": 0.6263, "mean_token_accuracy": 0.8360908254981041, "step": 45 }, { "epoch": 2.9068825910931175, "grad_norm": 0.07800301164388657, "learning_rate": 7.905597372338558e-08, "loss": 0.6464, "mean_token_accuracy": 0.8296581134200096, "step": 46 }, { "epoch": 2.97165991902834, "grad_norm": 0.07923697680234909, "learning_rate": 7.449572314383236e-08, "loss": 0.6825, "mean_token_accuracy": 0.8269040808081627, "step": 47 }, { "epoch": 3.0, "grad_norm": 0.096576027572155, "learning_rate": 6.999153649996594e-08, "loss": 0.6081, "mean_token_accuracy": 0.8382441401481628, "step": 48 }, { "epoch": 3.064777327935223, "grad_norm": 0.10343178361654282, "learning_rate": 6.555331497156671e-08, "loss": 0.6464, "mean_token_accuracy": 0.8325350135564804, "step": 49 }, { "epoch": 3.1295546558704452, "grad_norm": 0.07666892558336258, "learning_rate": 6.119081473277501e-08, "loss": 0.6379, "mean_token_accuracy": 0.8326343894004822, "step": 50 }, { "epoch": 3.194331983805668, "grad_norm": 0.07846437394618988, "learning_rate": 5.691362550590296e-08, "loss": 0.6438, "mean_token_accuracy": 0.8362045586109161, "step": 51 }, { "epoch": 3.2591093117408905, "grad_norm": 0.07778104394674301, "learning_rate": 5.2731149481143456e-08, "loss": 0.6296, "mean_token_accuracy": 0.8366052508354187, "step": 52 }, { "epoch": 3.3238866396761133, "grad_norm": 0.07893098890781403, "learning_rate": 4.8652580648515785e-08, "loss": 0.6426, "mean_token_accuracy": 0.83216942101717, "step": 53 }, { "epoch": 3.388663967611336, "grad_norm": 0.08473635464906693, "learning_rate": 4.4686884587480054e-08, "loss": 0.6913, "mean_token_accuracy": 0.8237782865762711, "step": 54 }, { "epoch": 3.4534412955465585, "grad_norm": 0.08035323023796082, "learning_rate": 4.084277875864776e-08, "loss": 0.635, "mean_token_accuracy": 0.8352065198123455, "step": 55 }, { "epoch": 3.5182186234817814, "grad_norm": 0.0813593938946724, "learning_rate": 3.712871334091153e-08, "loss": 0.6778, "mean_token_accuracy": 0.8271778710186481, "step": 56 }, { "epoch": 3.582995951417004, "grad_norm": 0.07975359261035919, "learning_rate": 3.355285265611784e-08, "loss": 0.632, "mean_token_accuracy": 0.8359011709690094, "step": 57 }, { "epoch": 3.6477732793522266, "grad_norm": 0.07455204427242279, "learning_rate": 3.0123057222115835e-08, "loss": 0.6287, "mean_token_accuracy": 0.8337721861898899, "step": 58 }, { "epoch": 3.7125506072874495, "grad_norm": 0.07816082239151001, "learning_rate": 2.6846866473633123e-08, "loss": 0.6399, "mean_token_accuracy": 0.835223838686943, "step": 59 }, { "epoch": 3.7773279352226723, "grad_norm": 0.08226487785577774, "learning_rate": 2.3731482188961815e-08, "loss": 0.6772, "mean_token_accuracy": 0.8268170021474361, "step": 60 }, { "epoch": 3.8421052631578947, "grad_norm": 0.08300112187862396, "learning_rate": 2.0783752658887067e-08, "loss": 0.6465, "mean_token_accuracy": 0.8339819870889187, "step": 61 }, { "epoch": 3.9068825910931175, "grad_norm": 0.08047865331172943, "learning_rate": 1.801015763265754e-08, "loss": 0.6524, "mean_token_accuracy": 0.8351892940700054, "step": 62 }, { "epoch": 3.97165991902834, "grad_norm": 0.07589241862297058, "learning_rate": 1.5416794074090255e-08, "loss": 0.598, "mean_token_accuracy": 0.8409505113959312, "step": 63 }, { "epoch": 4.0, "grad_norm": 0.07589241862297058, "learning_rate": 1.3009362759120978e-08, "loss": 0.6815, "mean_token_accuracy": 0.8250831876482282, "step": 64 }, { "epoch": 4.064777327935222, "grad_norm": 0.12950177490711212, "learning_rate": 1.079315574426135e-08, "loss": 0.6328, "mean_token_accuracy": 0.836986843496561, "step": 65 }, { "epoch": 4.129554655870446, "grad_norm": 0.07754851877689362, "learning_rate": 8.773044733510337e-09, "loss": 0.6614, "mean_token_accuracy": 0.829079158604145, "step": 66 }, { "epoch": 4.194331983805668, "grad_norm": 0.07672492414712906, "learning_rate": 6.953470369291348e-09, "loss": 0.6493, "mean_token_accuracy": 0.8342937044799328, "step": 67 }, { "epoch": 4.2591093117408905, "grad_norm": 0.0766540914773941, "learning_rate": 5.338432470956589e-09, "loss": 0.6079, "mean_token_accuracy": 0.8416761197149754, "step": 68 }, { "epoch": 4.323886639676114, "grad_norm": 0.07487937808036804, "learning_rate": 3.931481242315993e-09, "loss": 0.6135, "mean_token_accuracy": 0.839394424110651, "step": 69 }, { "epoch": 4.388663967611336, "grad_norm": 0.07662676274776459, "learning_rate": 2.7357094675186986e-09, "loss": 0.6908, "mean_token_accuracy": 0.8233718760311604, "step": 70 }, { "epoch": 4.4534412955465585, "grad_norm": 0.08624370396137238, "learning_rate": 1.7537457124423893e-09, "loss": 0.649, "mean_token_accuracy": 0.8329134620726109, "step": 71 }, { "epoch": 4.518218623481781, "grad_norm": 0.08158352971076965, "learning_rate": 9.877485465349057e-10, "loss": 0.646, "mean_token_accuracy": 0.8340884000062943, "step": 72 }, { "epoch": 4.582995951417004, "grad_norm": 0.08172011375427246, "learning_rate": 4.394017978101905e-10, "loss": 0.6344, "mean_token_accuracy": 0.8312402181327343, "step": 73 }, { "epoch": 4.647773279352227, "grad_norm": 0.07922390103340149, "learning_rate": 1.0991085142886269e-10, "loss": 0.666, "mean_token_accuracy": 0.8276109620928764, "step": 74 }, { "epoch": 4.712550607287449, "grad_norm": 0.08142900466918945, "learning_rate": 0.0, "loss": 0.659, "mean_token_accuracy": 0.8328934013843536, "step": 75 }, { "epoch": 4.712550607287449, "step": 75, "total_flos": 3.216438415536947e+16, "train_loss": 0.6463175455729167, "train_runtime": 630.4846, "train_samples_per_second": 15.663, "train_steps_per_second": 0.119 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.216438415536947e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }