{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03965551772481474, "eval_steps": 50, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004005607850991388, "grad_norm": 2.4344990253448486, "learning_rate": 5e-05, "loss": 3.1012, "step": 1 }, { "epoch": 0.0004005607850991388, "eval_loss": 3.0820372104644775, "eval_runtime": 28.6956, "eval_samples_per_second": 36.661, "eval_steps_per_second": 18.33, "step": 1 }, { "epoch": 0.0008011215701982776, "grad_norm": 1.7689874172210693, "learning_rate": 0.0001, "loss": 2.9772, "step": 2 }, { "epoch": 0.0012016823552974164, "grad_norm": 2.805964708328247, "learning_rate": 0.00015, "loss": 3.4926, "step": 3 }, { "epoch": 0.0016022431403965552, "grad_norm": 2.956672430038452, "learning_rate": 0.0002, "loss": 3.6622, "step": 4 }, { "epoch": 0.002002803925495694, "grad_norm": 2.5529518127441406, "learning_rate": 0.00025, "loss": 2.9273, "step": 5 }, { "epoch": 0.002403364710594833, "grad_norm": 1.831852912902832, "learning_rate": 0.0003, "loss": 2.5788, "step": 6 }, { "epoch": 0.0028039254956939716, "grad_norm": 1.6226868629455566, "learning_rate": 0.00035, "loss": 2.5177, "step": 7 }, { "epoch": 0.0032044862807931104, "grad_norm": 2.354496955871582, "learning_rate": 0.0004, "loss": 2.5655, "step": 8 }, { "epoch": 0.0036050470658922492, "grad_norm": 2.2879300117492676, "learning_rate": 0.00045000000000000004, "loss": 2.5502, "step": 9 }, { "epoch": 0.004005607850991388, "grad_norm": 2.574815034866333, "learning_rate": 0.0005, "loss": 2.2454, "step": 10 }, { "epoch": 0.004406168636090527, "grad_norm": 1.6621110439300537, "learning_rate": 0.0004998442655654946, "loss": 1.8034, "step": 11 }, { "epoch": 0.004806729421189666, "grad_norm": 1.984192132949829, "learning_rate": 0.0004993772562876909, "loss": 2.6323, "step": 12 }, { "epoch": 0.0052072902062888045, "grad_norm": 2.288874864578247, "learning_rate": 0.0004985995540019955, "loss": 1.9088, "step": 13 }, { "epoch": 0.005607850991387943, "grad_norm": 1.5363634824752808, "learning_rate": 0.0004975121276286136, "loss": 2.0859, "step": 14 }, { "epoch": 0.006008411776487082, "grad_norm": 1.1328516006469727, "learning_rate": 0.0004961163319653958, "loss": 1.9693, "step": 15 }, { "epoch": 0.006408972561586221, "grad_norm": 1.3997936248779297, "learning_rate": 0.0004944139059999286, "loss": 1.9882, "step": 16 }, { "epoch": 0.00680953334668536, "grad_norm": 1.317238688468933, "learning_rate": 0.000492406970742972, "loss": 1.9312, "step": 17 }, { "epoch": 0.0072100941317844985, "grad_norm": 1.458006501197815, "learning_rate": 0.0004900980265859448, "loss": 2.4625, "step": 18 }, { "epoch": 0.007610654916883637, "grad_norm": 0.970038115978241, "learning_rate": 0.0004874899501857477, "loss": 2.2326, "step": 19 }, { "epoch": 0.008011215701982776, "grad_norm": 1.6142843961715698, "learning_rate": 0.00048458599088080736, "loss": 1.9539, "step": 20 }, { "epoch": 0.008411776487081914, "grad_norm": 1.353042483329773, "learning_rate": 0.0004813897666428053, "loss": 2.335, "step": 21 }, { "epoch": 0.008812337272181054, "grad_norm": 1.423920750617981, "learning_rate": 0.00047790525956913543, "loss": 1.7413, "step": 22 }, { "epoch": 0.009212898057280192, "grad_norm": 1.5239485502243042, "learning_rate": 0.0004741368109217071, "loss": 2.2131, "step": 23 }, { "epoch": 0.009613458842379331, "grad_norm": 1.8239572048187256, "learning_rate": 0.00047008911571827283, "loss": 1.7396, "step": 24 }, { "epoch": 0.01001401962747847, "grad_norm": 1.0884772539138794, "learning_rate": 0.00046576721688302105, "loss": 2.2057, "step": 25 }, { "epoch": 0.010414580412577609, "grad_norm": 1.4631233215332031, "learning_rate": 0.0004611764989637205, "loss": 1.9047, "step": 26 }, { "epoch": 0.010815141197676747, "grad_norm": 1.1582131385803223, "learning_rate": 0.0004563226814232444, "loss": 2.5048, "step": 27 }, { "epoch": 0.011215701982775887, "grad_norm": 1.416812777519226, "learning_rate": 0.0004512118115138315, "loss": 1.8986, "step": 28 }, { "epoch": 0.011616262767875024, "grad_norm": 1.1690479516983032, "learning_rate": 0.0004458502567429631, "loss": 2.1514, "step": 29 }, { "epoch": 0.012016823552974164, "grad_norm": 1.16459059715271, "learning_rate": 0.00044024469694024196, "loss": 1.8639, "step": 30 }, { "epoch": 0.012417384338073302, "grad_norm": 1.056779384613037, "learning_rate": 0.00043440211593515554, "loss": 1.9256, "step": 31 }, { "epoch": 0.012817945123172442, "grad_norm": 1.5517312288284302, "learning_rate": 0.0004283297928560951, "loss": 1.9297, "step": 32 }, { "epoch": 0.01321850590827158, "grad_norm": 1.3826098442077637, "learning_rate": 0.0004220352930614672, "loss": 2.282, "step": 33 }, { "epoch": 0.01361906669337072, "grad_norm": 1.194398283958435, "learning_rate": 0.00041552645871420013, "loss": 2.2492, "step": 34 }, { "epoch": 0.014019627478469857, "grad_norm": 1.8520417213439941, "learning_rate": 0.00040881139901138467, "loss": 2.4, "step": 35 }, { "epoch": 0.014420188263568997, "grad_norm": 1.278959035873413, "learning_rate": 0.00040189848008122475, "loss": 2.1325, "step": 36 }, { "epoch": 0.014820749048668135, "grad_norm": 1.786801815032959, "learning_rate": 0.00039479631455988334, "loss": 2.0348, "step": 37 }, { "epoch": 0.015221309833767275, "grad_norm": 1.3184572458267212, "learning_rate": 0.0003875137508612103, "loss": 2.0454, "step": 38 }, { "epoch": 0.015621870618866412, "grad_norm": 1.1530405282974243, "learning_rate": 0.00038005986215272055, "loss": 2.2434, "step": 39 }, { "epoch": 0.016022431403965552, "grad_norm": 1.4723411798477173, "learning_rate": 0.0003724439350515571, "loss": 1.8833, "step": 40 }, { "epoch": 0.016422992189064692, "grad_norm": 1.6857566833496094, "learning_rate": 0.0003646754580545226, "loss": 2.2384, "step": 41 }, { "epoch": 0.016823552974163828, "grad_norm": 1.2251979112625122, "learning_rate": 0.000356764109716594, "loss": 2.5669, "step": 42 }, { "epoch": 0.017224113759262968, "grad_norm": 1.5817160606384277, "learning_rate": 0.00034871974659264783, "loss": 2.5998, "step": 43 }, { "epoch": 0.017624674544362107, "grad_norm": 1.326270580291748, "learning_rate": 0.0003405523909574206, "loss": 2.2238, "step": 44 }, { "epoch": 0.018025235329461247, "grad_norm": 0.9494209885597229, "learning_rate": 0.0003322722183190025, "loss": 2.0566, "step": 45 }, { "epoch": 0.018425796114560383, "grad_norm": 1.1728250980377197, "learning_rate": 0.0003238895447414211, "loss": 1.697, "step": 46 }, { "epoch": 0.018826356899659523, "grad_norm": 1.5986175537109375, "learning_rate": 0.0003154148139921102, "loss": 1.7549, "step": 47 }, { "epoch": 0.019226917684758663, "grad_norm": 1.1737697124481201, "learning_rate": 0.00030685858453027663, "loss": 1.7618, "step": 48 }, { "epoch": 0.019627478469857802, "grad_norm": 1.0616875886917114, "learning_rate": 0.0002982315163523742, "loss": 2.3967, "step": 49 }, { "epoch": 0.02002803925495694, "grad_norm": 1.4347580671310425, "learning_rate": 0.000289544357711076, "loss": 1.7634, "step": 50 }, { "epoch": 0.02002803925495694, "eval_loss": 2.012052536010742, "eval_runtime": 28.5887, "eval_samples_per_second": 36.798, "eval_steps_per_second": 18.399, "step": 50 }, { "epoch": 0.020428600040056078, "grad_norm": 1.5467963218688965, "learning_rate": 0.0002808079317242896, "loss": 2.0926, "step": 51 }, { "epoch": 0.020829160825155218, "grad_norm": 1.1608525514602661, "learning_rate": 0.0002720331228909005, "loss": 1.9368, "step": 52 }, { "epoch": 0.021229721610254357, "grad_norm": 1.5125212669372559, "learning_rate": 0.00026323086353004075, "loss": 2.2591, "step": 53 }, { "epoch": 0.021630282395353494, "grad_norm": 1.141327142715454, "learning_rate": 0.0002544121201607822, "loss": 1.8238, "step": 54 }, { "epoch": 0.022030843180452633, "grad_norm": 1.1864770650863647, "learning_rate": 0.00024558787983921783, "loss": 1.9923, "step": 55 }, { "epoch": 0.022431403965551773, "grad_norm": 1.2866952419281006, "learning_rate": 0.0002367691364699592, "loss": 1.9187, "step": 56 }, { "epoch": 0.022831964750650913, "grad_norm": 1.362595558166504, "learning_rate": 0.00022796687710909964, "loss": 2.0923, "step": 57 }, { "epoch": 0.02323252553575005, "grad_norm": 1.3138153553009033, "learning_rate": 0.00021919206827571036, "loss": 2.0055, "step": 58 }, { "epoch": 0.02363308632084919, "grad_norm": 1.1482343673706055, "learning_rate": 0.00021045564228892402, "loss": 1.9882, "step": 59 }, { "epoch": 0.024033647105948328, "grad_norm": 1.2140475511550903, "learning_rate": 0.00020176848364762578, "loss": 1.725, "step": 60 }, { "epoch": 0.024434207891047468, "grad_norm": 1.5149836540222168, "learning_rate": 0.00019314141546972343, "loss": 2.1587, "step": 61 }, { "epoch": 0.024834768676146604, "grad_norm": 1.5307202339172363, "learning_rate": 0.00018458518600788986, "loss": 2.1868, "step": 62 }, { "epoch": 0.025235329461245744, "grad_norm": 1.1949517726898193, "learning_rate": 0.00017611045525857898, "loss": 2.0899, "step": 63 }, { "epoch": 0.025635890246344883, "grad_norm": 1.2019050121307373, "learning_rate": 0.0001677277816809975, "loss": 1.7959, "step": 64 }, { "epoch": 0.026036451031444023, "grad_norm": 1.137641429901123, "learning_rate": 0.00015944760904257942, "loss": 1.9071, "step": 65 }, { "epoch": 0.02643701181654316, "grad_norm": 1.4122196435928345, "learning_rate": 0.0001512802534073522, "loss": 1.9875, "step": 66 }, { "epoch": 0.0268375726016423, "grad_norm": 1.3938771486282349, "learning_rate": 0.00014323589028340596, "loss": 1.9302, "step": 67 }, { "epoch": 0.02723813338674144, "grad_norm": 1.1219323873519897, "learning_rate": 0.00013532454194547733, "loss": 1.9583, "step": 68 }, { "epoch": 0.02763869417184058, "grad_norm": 1.221145749092102, "learning_rate": 0.00012755606494844294, "loss": 2.1028, "step": 69 }, { "epoch": 0.028039254956939715, "grad_norm": 1.6395137310028076, "learning_rate": 0.00011994013784727947, "loss": 1.955, "step": 70 }, { "epoch": 0.028439815742038854, "grad_norm": 1.0482176542282104, "learning_rate": 0.00011248624913878966, "loss": 2.112, "step": 71 }, { "epoch": 0.028840376527137994, "grad_norm": 1.2491412162780762, "learning_rate": 0.0001052036854401166, "loss": 2.2217, "step": 72 }, { "epoch": 0.029240937312237134, "grad_norm": 1.3600395917892456, "learning_rate": 9.810151991877531e-05, "loss": 2.2656, "step": 73 }, { "epoch": 0.02964149809733627, "grad_norm": 1.2350081205368042, "learning_rate": 9.118860098861537e-05, "loss": 2.0685, "step": 74 }, { "epoch": 0.03004205888243541, "grad_norm": 1.2208542823791504, "learning_rate": 8.44735412857999e-05, "loss": 1.539, "step": 75 }, { "epoch": 0.03044261966753455, "grad_norm": 1.370618462562561, "learning_rate": 7.79647069385328e-05, "loss": 2.0545, "step": 76 }, { "epoch": 0.03084318045263369, "grad_norm": 1.1490741968154907, "learning_rate": 7.167020714390501e-05, "loss": 2.0587, "step": 77 }, { "epoch": 0.031243741237732825, "grad_norm": 1.220718502998352, "learning_rate": 6.559788406484446e-05, "loss": 1.8599, "step": 78 }, { "epoch": 0.031644302022831965, "grad_norm": 1.64824378490448, "learning_rate": 5.975530305975807e-05, "loss": 2.3586, "step": 79 }, { "epoch": 0.032044862807931104, "grad_norm": 1.495094656944275, "learning_rate": 5.414974325703686e-05, "loss": 2.1085, "step": 80 }, { "epoch": 0.032445423593030244, "grad_norm": 1.1038875579833984, "learning_rate": 4.8788188486168616e-05, "loss": 2.1465, "step": 81 }, { "epoch": 0.032845984378129384, "grad_norm": 1.450072169303894, "learning_rate": 4.367731857675569e-05, "loss": 1.9168, "step": 82 }, { "epoch": 0.03324654516322852, "grad_norm": 1.2426930665969849, "learning_rate": 3.882350103627952e-05, "loss": 2.0372, "step": 83 }, { "epoch": 0.033647105948327656, "grad_norm": 1.189127802848816, "learning_rate": 3.423278311697897e-05, "loss": 1.6364, "step": 84 }, { "epoch": 0.034047666733426796, "grad_norm": 1.3084362745285034, "learning_rate": 2.9910884281727225e-05, "loss": 2.0304, "step": 85 }, { "epoch": 0.034448227518525935, "grad_norm": 0.7917243838310242, "learning_rate": 2.586318907829291e-05, "loss": 2.3032, "step": 86 }, { "epoch": 0.034848788303625075, "grad_norm": 1.2357211112976074, "learning_rate": 2.209474043086457e-05, "loss": 1.8531, "step": 87 }, { "epoch": 0.035249349088724215, "grad_norm": 1.5522675514221191, "learning_rate": 1.861023335719475e-05, "loss": 2.0952, "step": 88 }, { "epoch": 0.035649909873823354, "grad_norm": 1.166305422782898, "learning_rate": 1.5414009119192633e-05, "loss": 1.7682, "step": 89 }, { "epoch": 0.036050470658922494, "grad_norm": 1.2451746463775635, "learning_rate": 1.25100498142523e-05, "loss": 1.9993, "step": 90 }, { "epoch": 0.03645103144402163, "grad_norm": 1.251763105392456, "learning_rate": 9.901973414055187e-06, "loss": 1.9152, "step": 91 }, { "epoch": 0.036851592229120766, "grad_norm": 1.1293752193450928, "learning_rate": 7.593029257027956e-06, "loss": 2.0776, "step": 92 }, { "epoch": 0.037252153014219906, "grad_norm": 1.1258478164672852, "learning_rate": 5.5860940000714015e-06, "loss": 2.1167, "step": 93 }, { "epoch": 0.037652713799319046, "grad_norm": 1.185185432434082, "learning_rate": 3.8836680346041594e-06, "loss": 1.8718, "step": 94 }, { "epoch": 0.038053274584418185, "grad_norm": 1.2053008079528809, "learning_rate": 2.487872371386424e-06, "loss": 2.3429, "step": 95 }, { "epoch": 0.038453835369517325, "grad_norm": 0.8643404245376587, "learning_rate": 1.4004459980045125e-06, "loss": 1.8221, "step": 96 }, { "epoch": 0.038854396154616465, "grad_norm": 1.4989515542984009, "learning_rate": 6.22743712309054e-07, "loss": 1.8639, "step": 97 }, { "epoch": 0.039254956939715605, "grad_norm": 1.2497055530548096, "learning_rate": 1.557344345054501e-07, "loss": 2.279, "step": 98 }, { "epoch": 0.03965551772481474, "grad_norm": 1.5901930332183838, "learning_rate": 0.0, "loss": 2.0944, "step": 99 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5038480606887936.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }