{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.114616497829233, "eval_steps": 50, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011577424023154848, "grad_norm": 0.2338106632232666, "learning_rate": 5e-05, "loss": 1.067, "step": 1 }, { "epoch": 0.0011577424023154848, "eval_loss": 1.1207107305526733, "eval_runtime": 12.9724, "eval_samples_per_second": 28.06, "eval_steps_per_second": 14.03, "step": 1 }, { "epoch": 0.0023154848046309695, "grad_norm": 0.30521467328071594, "learning_rate": 0.0001, "loss": 1.209, "step": 2 }, { "epoch": 0.0034732272069464545, "grad_norm": 0.27888786792755127, "learning_rate": 0.00015, "loss": 1.317, "step": 3 }, { "epoch": 0.004630969609261939, "grad_norm": 0.3030368685722351, "learning_rate": 0.0002, "loss": 0.9915, "step": 4 }, { "epoch": 0.005788712011577424, "grad_norm": 0.3091076612472534, "learning_rate": 0.00025, "loss": 1.1124, "step": 5 }, { "epoch": 0.006946454413892909, "grad_norm": 0.49404653906822205, "learning_rate": 0.0003, "loss": 1.1443, "step": 6 }, { "epoch": 0.008104196816208394, "grad_norm": 0.45804426074028015, "learning_rate": 0.00035, "loss": 0.848, "step": 7 }, { "epoch": 0.009261939218523878, "grad_norm": 0.42264845967292786, "learning_rate": 0.0004, "loss": 0.8745, "step": 8 }, { "epoch": 0.010419681620839363, "grad_norm": 0.4398542642593384, "learning_rate": 0.00045000000000000004, "loss": 0.7968, "step": 9 }, { "epoch": 0.011577424023154847, "grad_norm": 0.5333066582679749, "learning_rate": 0.0005, "loss": 0.6171, "step": 10 }, { "epoch": 0.012735166425470333, "grad_norm": 0.6457921862602234, "learning_rate": 0.0004998442655654946, "loss": 0.6853, "step": 11 }, { "epoch": 0.013892908827785818, "grad_norm": 0.6219100952148438, "learning_rate": 0.0004993772562876909, "loss": 0.8119, "step": 12 }, { "epoch": 0.015050651230101303, "grad_norm": 0.4639165997505188, "learning_rate": 0.0004985995540019955, "loss": 0.5416, "step": 13 }, { "epoch": 0.016208393632416787, "grad_norm": 0.4277282655239105, "learning_rate": 0.0004975121276286136, "loss": 0.5533, "step": 14 }, { "epoch": 0.017366136034732273, "grad_norm": 0.5572855472564697, "learning_rate": 0.0004961163319653958, "loss": 0.7072, "step": 15 }, { "epoch": 0.018523878437047756, "grad_norm": 0.4737989604473114, "learning_rate": 0.0004944139059999286, "loss": 0.6697, "step": 16 }, { "epoch": 0.019681620839363242, "grad_norm": 0.4247514009475708, "learning_rate": 0.000492406970742972, "loss": 0.5848, "step": 17 }, { "epoch": 0.020839363241678725, "grad_norm": 0.40353336930274963, "learning_rate": 0.0004900980265859448, "loss": 0.6247, "step": 18 }, { "epoch": 0.02199710564399421, "grad_norm": 0.426436185836792, "learning_rate": 0.0004874899501857477, "loss": 0.608, "step": 19 }, { "epoch": 0.023154848046309694, "grad_norm": 0.49942946434020996, "learning_rate": 0.00048458599088080736, "loss": 0.671, "step": 20 }, { "epoch": 0.02431259044862518, "grad_norm": 0.39419907331466675, "learning_rate": 0.0004813897666428053, "loss": 0.5482, "step": 21 }, { "epoch": 0.025470332850940667, "grad_norm": 0.36672383546829224, "learning_rate": 0.00047790525956913543, "loss": 0.5659, "step": 22 }, { "epoch": 0.02662807525325615, "grad_norm": 0.3333776891231537, "learning_rate": 0.0004741368109217071, "loss": 0.5094, "step": 23 }, { "epoch": 0.027785817655571636, "grad_norm": 0.3671005666255951, "learning_rate": 0.00047008911571827283, "loss": 0.5006, "step": 24 }, { "epoch": 0.02894356005788712, "grad_norm": 0.2863791882991791, "learning_rate": 0.00046576721688302105, "loss": 0.4838, "step": 25 }, { "epoch": 0.030101302460202605, "grad_norm": 0.37180379033088684, "learning_rate": 0.0004611764989637205, "loss": 0.447, "step": 26 }, { "epoch": 0.03125904486251809, "grad_norm": 0.3161357343196869, "learning_rate": 0.0004563226814232444, "loss": 0.468, "step": 27 }, { "epoch": 0.032416787264833574, "grad_norm": 0.42159467935562134, "learning_rate": 0.0004512118115138315, "loss": 0.5334, "step": 28 }, { "epoch": 0.03357452966714906, "grad_norm": 0.3421444892883301, "learning_rate": 0.0004458502567429631, "loss": 0.4989, "step": 29 }, { "epoch": 0.03473227206946455, "grad_norm": 0.37070757150650024, "learning_rate": 0.00044024469694024196, "loss": 0.4877, "step": 30 }, { "epoch": 0.03589001447178003, "grad_norm": 0.3389771580696106, "learning_rate": 0.00043440211593515554, "loss": 0.454, "step": 31 }, { "epoch": 0.03704775687409551, "grad_norm": 0.3566928803920746, "learning_rate": 0.0004283297928560951, "loss": 0.6784, "step": 32 }, { "epoch": 0.038205499276410995, "grad_norm": 0.2971787452697754, "learning_rate": 0.0004220352930614672, "loss": 0.5285, "step": 33 }, { "epoch": 0.039363241678726485, "grad_norm": 0.3165189027786255, "learning_rate": 0.00041552645871420013, "loss": 0.4749, "step": 34 }, { "epoch": 0.04052098408104197, "grad_norm": 0.2689637839794159, "learning_rate": 0.00040881139901138467, "loss": 0.441, "step": 35 }, { "epoch": 0.04167872648335745, "grad_norm": 0.3806246221065521, "learning_rate": 0.00040189848008122475, "loss": 0.537, "step": 36 }, { "epoch": 0.04283646888567294, "grad_norm": 0.2700049579143524, "learning_rate": 0.00039479631455988334, "loss": 0.5311, "step": 37 }, { "epoch": 0.04399421128798842, "grad_norm": 0.3233337998390198, "learning_rate": 0.0003875137508612103, "loss": 0.4348, "step": 38 }, { "epoch": 0.045151953690303906, "grad_norm": 0.2585344612598419, "learning_rate": 0.00038005986215272055, "loss": 0.4228, "step": 39 }, { "epoch": 0.04630969609261939, "grad_norm": 0.2899521589279175, "learning_rate": 0.0003724439350515571, "loss": 0.4575, "step": 40 }, { "epoch": 0.04746743849493488, "grad_norm": 0.5732170343399048, "learning_rate": 0.0003646754580545226, "loss": 0.4874, "step": 41 }, { "epoch": 0.04862518089725036, "grad_norm": 0.2810184955596924, "learning_rate": 0.000356764109716594, "loss": 0.4141, "step": 42 }, { "epoch": 0.049782923299565844, "grad_norm": 0.3357601463794708, "learning_rate": 0.00034871974659264783, "loss": 0.5582, "step": 43 }, { "epoch": 0.050940665701881334, "grad_norm": 0.299430787563324, "learning_rate": 0.0003405523909574206, "loss": 0.4751, "step": 44 }, { "epoch": 0.05209840810419682, "grad_norm": 0.2595735490322113, "learning_rate": 0.0003322722183190025, "loss": 0.3575, "step": 45 }, { "epoch": 0.0532561505065123, "grad_norm": 0.32403305172920227, "learning_rate": 0.0003238895447414211, "loss": 0.4585, "step": 46 }, { "epoch": 0.05441389290882779, "grad_norm": 0.3115895390510559, "learning_rate": 0.0003154148139921102, "loss": 0.6248, "step": 47 }, { "epoch": 0.05557163531114327, "grad_norm": 0.26747116446495056, "learning_rate": 0.00030685858453027663, "loss": 0.4572, "step": 48 }, { "epoch": 0.056729377713458755, "grad_norm": 0.24940979480743408, "learning_rate": 0.0002982315163523742, "loss": 0.3957, "step": 49 }, { "epoch": 0.05788712011577424, "grad_norm": 0.31252002716064453, "learning_rate": 0.000289544357711076, "loss": 0.3606, "step": 50 }, { "epoch": 0.05788712011577424, "eval_loss": 0.4854944944381714, "eval_runtime": 12.8574, "eval_samples_per_second": 28.311, "eval_steps_per_second": 14.155, "step": 50 }, { "epoch": 0.05904486251808973, "grad_norm": 0.26592421531677246, "learning_rate": 0.0002808079317242896, "loss": 0.5123, "step": 51 }, { "epoch": 0.06020260492040521, "grad_norm": 0.3215560019016266, "learning_rate": 0.0002720331228909005, "loss": 0.6261, "step": 52 }, { "epoch": 0.06136034732272069, "grad_norm": 0.2619563043117523, "learning_rate": 0.00026323086353004075, "loss": 0.4538, "step": 53 }, { "epoch": 0.06251808972503618, "grad_norm": 0.25585314631462097, "learning_rate": 0.0002544121201607822, "loss": 0.4621, "step": 54 }, { "epoch": 0.06367583212735166, "grad_norm": 0.2522573471069336, "learning_rate": 0.00024558787983921783, "loss": 0.4833, "step": 55 }, { "epoch": 0.06483357452966715, "grad_norm": 0.27701646089553833, "learning_rate": 0.0002367691364699592, "loss": 0.4159, "step": 56 }, { "epoch": 0.06599131693198264, "grad_norm": 0.33121973276138306, "learning_rate": 0.00022796687710909964, "loss": 0.5848, "step": 57 }, { "epoch": 0.06714905933429811, "grad_norm": 0.27029693126678467, "learning_rate": 0.00021919206827571036, "loss": 0.4566, "step": 58 }, { "epoch": 0.0683068017366136, "grad_norm": 0.3517721891403198, "learning_rate": 0.00021045564228892402, "loss": 0.4093, "step": 59 }, { "epoch": 0.0694645441389291, "grad_norm": 0.3207434117794037, "learning_rate": 0.00020176848364762578, "loss": 0.432, "step": 60 }, { "epoch": 0.07062228654124457, "grad_norm": 0.25897809863090515, "learning_rate": 0.00019314141546972343, "loss": 0.4708, "step": 61 }, { "epoch": 0.07178002894356006, "grad_norm": 0.34539109468460083, "learning_rate": 0.00018458518600788986, "loss": 0.4638, "step": 62 }, { "epoch": 0.07293777134587555, "grad_norm": 0.24760332703590393, "learning_rate": 0.00017611045525857898, "loss": 0.3803, "step": 63 }, { "epoch": 0.07409551374819102, "grad_norm": 0.26825204491615295, "learning_rate": 0.0001677277816809975, "loss": 0.5163, "step": 64 }, { "epoch": 0.07525325615050651, "grad_norm": 0.43493548035621643, "learning_rate": 0.00015944760904257942, "loss": 0.5339, "step": 65 }, { "epoch": 0.07641099855282199, "grad_norm": 0.26649484038352966, "learning_rate": 0.0001512802534073522, "loss": 0.4251, "step": 66 }, { "epoch": 0.07756874095513748, "grad_norm": 0.273527055978775, "learning_rate": 0.00014323589028340596, "loss": 0.4912, "step": 67 }, { "epoch": 0.07872648335745297, "grad_norm": 0.3016795516014099, "learning_rate": 0.00013532454194547733, "loss": 0.4472, "step": 68 }, { "epoch": 0.07988422575976845, "grad_norm": 0.31133025884628296, "learning_rate": 0.00012755606494844294, "loss": 0.5907, "step": 69 }, { "epoch": 0.08104196816208394, "grad_norm": 0.3809708058834076, "learning_rate": 0.00011994013784727947, "loss": 0.586, "step": 70 }, { "epoch": 0.08219971056439943, "grad_norm": 0.402822881937027, "learning_rate": 0.00011248624913878966, "loss": 0.6835, "step": 71 }, { "epoch": 0.0833574529667149, "grad_norm": 0.2550395131111145, "learning_rate": 0.0001052036854401166, "loss": 0.2994, "step": 72 }, { "epoch": 0.08451519536903039, "grad_norm": 0.2937041223049164, "learning_rate": 9.810151991877531e-05, "loss": 0.507, "step": 73 }, { "epoch": 0.08567293777134588, "grad_norm": 0.3934008479118347, "learning_rate": 9.118860098861537e-05, "loss": 0.4647, "step": 74 }, { "epoch": 0.08683068017366136, "grad_norm": 0.31899431347846985, "learning_rate": 8.44735412857999e-05, "loss": 0.3375, "step": 75 }, { "epoch": 0.08798842257597685, "grad_norm": 0.5414046049118042, "learning_rate": 7.79647069385328e-05, "loss": 0.4782, "step": 76 }, { "epoch": 0.08914616497829234, "grad_norm": 0.2682500183582306, "learning_rate": 7.167020714390501e-05, "loss": 0.422, "step": 77 }, { "epoch": 0.09030390738060781, "grad_norm": 0.24800242483615875, "learning_rate": 6.559788406484446e-05, "loss": 0.4617, "step": 78 }, { "epoch": 0.0914616497829233, "grad_norm": 0.361648827791214, "learning_rate": 5.975530305975807e-05, "loss": 0.6129, "step": 79 }, { "epoch": 0.09261939218523878, "grad_norm": 0.2895083725452423, "learning_rate": 5.414974325703686e-05, "loss": 0.5065, "step": 80 }, { "epoch": 0.09377713458755427, "grad_norm": 0.26388344168663025, "learning_rate": 4.8788188486168616e-05, "loss": 0.5139, "step": 81 }, { "epoch": 0.09493487698986976, "grad_norm": 0.23608814179897308, "learning_rate": 4.367731857675569e-05, "loss": 0.4606, "step": 82 }, { "epoch": 0.09609261939218523, "grad_norm": 0.2052265703678131, "learning_rate": 3.882350103627952e-05, "loss": 0.3697, "step": 83 }, { "epoch": 0.09725036179450072, "grad_norm": 0.24611227214336395, "learning_rate": 3.423278311697897e-05, "loss": 0.5282, "step": 84 }, { "epoch": 0.09840810419681621, "grad_norm": 0.23457454144954681, "learning_rate": 2.9910884281727225e-05, "loss": 0.4415, "step": 85 }, { "epoch": 0.09956584659913169, "grad_norm": 0.26964303851127625, "learning_rate": 2.586318907829291e-05, "loss": 0.3641, "step": 86 }, { "epoch": 0.10072358900144718, "grad_norm": 0.3212110996246338, "learning_rate": 2.209474043086457e-05, "loss": 0.4868, "step": 87 }, { "epoch": 0.10188133140376267, "grad_norm": 0.38655808568000793, "learning_rate": 1.861023335719475e-05, "loss": 0.5066, "step": 88 }, { "epoch": 0.10303907380607814, "grad_norm": 0.25838810205459595, "learning_rate": 1.5414009119192633e-05, "loss": 0.372, "step": 89 }, { "epoch": 0.10419681620839363, "grad_norm": 0.3056221604347229, "learning_rate": 1.25100498142523e-05, "loss": 0.4282, "step": 90 }, { "epoch": 0.10535455861070912, "grad_norm": 0.20704999566078186, "learning_rate": 9.901973414055187e-06, "loss": 0.3774, "step": 91 }, { "epoch": 0.1065123010130246, "grad_norm": 0.24676023423671722, "learning_rate": 7.593029257027956e-06, "loss": 0.3745, "step": 92 }, { "epoch": 0.10767004341534009, "grad_norm": 0.30260273814201355, "learning_rate": 5.5860940000714015e-06, "loss": 0.4059, "step": 93 }, { "epoch": 0.10882778581765558, "grad_norm": 0.2307083010673523, "learning_rate": 3.8836680346041594e-06, "loss": 0.3894, "step": 94 }, { "epoch": 0.10998552821997105, "grad_norm": 0.3418976068496704, "learning_rate": 2.487872371386424e-06, "loss": 0.5039, "step": 95 }, { "epoch": 0.11114327062228654, "grad_norm": 0.25175032019615173, "learning_rate": 1.4004459980045125e-06, "loss": 0.4215, "step": 96 }, { "epoch": 0.11230101302460202, "grad_norm": 0.24384403228759766, "learning_rate": 6.22743712309054e-07, "loss": 0.4133, "step": 97 }, { "epoch": 0.11345875542691751, "grad_norm": 0.2955150604248047, "learning_rate": 1.557344345054501e-07, "loss": 0.4622, "step": 98 }, { "epoch": 0.114616497829233, "grad_norm": 0.2382400780916214, "learning_rate": 0.0, "loss": 0.4178, "step": 99 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.970786582868787e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }