{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17543859649122806, "eval_steps": 30, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017543859649122807, "grad_norm": 8.17198371887207, "learning_rate": 5e-05, "loss": 14.9378, "step": 1 }, { "epoch": 0.0017543859649122807, "eval_loss": 3.2134883403778076, "eval_runtime": 4.6738, "eval_samples_per_second": 51.35, "eval_steps_per_second": 25.675, "step": 1 }, { "epoch": 0.0035087719298245615, "grad_norm": 7.3168792724609375, "learning_rate": 0.0001, "loss": 12.4736, "step": 2 }, { "epoch": 0.005263157894736842, "grad_norm": 9.029633522033691, "learning_rate": 0.00015, "loss": 13.3844, "step": 3 }, { "epoch": 0.007017543859649123, "grad_norm": 7.564419746398926, "learning_rate": 0.0002, "loss": 12.5128, "step": 4 }, { "epoch": 0.008771929824561403, "grad_norm": 8.678909301757812, "learning_rate": 0.00025, "loss": 13.0232, "step": 5 }, { "epoch": 0.010526315789473684, "grad_norm": 8.420906066894531, "learning_rate": 0.0003, "loss": 11.0386, "step": 6 }, { "epoch": 0.012280701754385965, "grad_norm": 7.58260440826416, "learning_rate": 0.00035, "loss": 10.529, "step": 7 }, { "epoch": 0.014035087719298246, "grad_norm": 6.728865623474121, "learning_rate": 0.0004, "loss": 9.3214, "step": 8 }, { "epoch": 0.015789473684210527, "grad_norm": 7.110503673553467, "learning_rate": 0.00045000000000000004, "loss": 9.4799, "step": 9 }, { "epoch": 0.017543859649122806, "grad_norm": 6.563074111938477, "learning_rate": 0.0005, "loss": 7.995, "step": 10 }, { "epoch": 0.01929824561403509, "grad_norm": 7.664244174957275, "learning_rate": 0.0004998477067547739, "loss": 9.3882, "step": 11 }, { "epoch": 0.021052631578947368, "grad_norm": 10.137924194335938, "learning_rate": 0.0004993910125649561, "loss": 8.5776, "step": 12 }, { "epoch": 0.02280701754385965, "grad_norm": 9.467455863952637, "learning_rate": 0.0004986304738420684, "loss": 7.9458, "step": 13 }, { "epoch": 0.02456140350877193, "grad_norm": 6.65028190612793, "learning_rate": 0.0004975670171853926, "loss": 7.1171, "step": 14 }, { "epoch": 0.02631578947368421, "grad_norm": 5.827754020690918, "learning_rate": 0.000496201938253052, "loss": 7.7485, "step": 15 }, { "epoch": 0.028070175438596492, "grad_norm": 5.733964443206787, "learning_rate": 0.0004945369001834514, "loss": 6.7287, "step": 16 }, { "epoch": 0.02982456140350877, "grad_norm": 6.325059413909912, "learning_rate": 0.0004925739315689991, "loss": 6.3347, "step": 17 }, { "epoch": 0.031578947368421054, "grad_norm": 5.700626373291016, "learning_rate": 0.0004903154239845797, "loss": 6.6159, "step": 18 }, { "epoch": 0.03333333333333333, "grad_norm": 5.114159107208252, "learning_rate": 0.0004877641290737884, "loss": 6.3928, "step": 19 }, { "epoch": 0.03508771929824561, "grad_norm": 6.295504570007324, "learning_rate": 0.0004849231551964771, "loss": 7.0686, "step": 20 }, { "epoch": 0.03684210526315789, "grad_norm": 5.665600776672363, "learning_rate": 0.00048179596364169685, "loss": 6.4039, "step": 21 }, { "epoch": 0.03859649122807018, "grad_norm": 5.866429805755615, "learning_rate": 0.0004783863644106502, "loss": 6.387, "step": 22 }, { "epoch": 0.04035087719298246, "grad_norm": 5.141241073608398, "learning_rate": 0.00047469851157479177, "loss": 6.344, "step": 23 }, { "epoch": 0.042105263157894736, "grad_norm": 4.911355972290039, "learning_rate": 0.00047073689821473173, "loss": 6.2085, "step": 24 }, { "epoch": 0.043859649122807015, "grad_norm": 5.492550373077393, "learning_rate": 0.00046650635094610973, "loss": 6.1914, "step": 25 }, { "epoch": 0.0456140350877193, "grad_norm": 4.625117301940918, "learning_rate": 0.00046201202403910646, "loss": 6.6255, "step": 26 }, { "epoch": 0.04736842105263158, "grad_norm": 5.429194927215576, "learning_rate": 0.00045725939313876043, "loss": 6.2794, "step": 27 }, { "epoch": 0.04912280701754386, "grad_norm": 4.251866340637207, "learning_rate": 0.0004522542485937369, "loss": 5.7326, "step": 28 }, { "epoch": 0.05087719298245614, "grad_norm": 4.642629146575928, "learning_rate": 0.00044700268840168044, "loss": 6.8355, "step": 29 }, { "epoch": 0.05263157894736842, "grad_norm": 5.152062892913818, "learning_rate": 0.0004415111107797445, "loss": 6.6811, "step": 30 }, { "epoch": 0.05263157894736842, "eval_loss": 1.5871975421905518, "eval_runtime": 4.2106, "eval_samples_per_second": 56.999, "eval_steps_per_second": 28.499, "step": 30 }, { "epoch": 0.054385964912280704, "grad_norm": 5.211429595947266, "learning_rate": 0.00043578620636934855, "loss": 6.8478, "step": 31 }, { "epoch": 0.056140350877192984, "grad_norm": 4.346696376800537, "learning_rate": 0.0004298349500846628, "loss": 6.7111, "step": 32 }, { "epoch": 0.05789473684210526, "grad_norm": 4.841087818145752, "learning_rate": 0.00042366459261474935, "loss": 6.8403, "step": 33 }, { "epoch": 0.05964912280701754, "grad_norm": 4.458794593811035, "learning_rate": 0.0004172826515897146, "loss": 6.521, "step": 34 }, { "epoch": 0.06140350877192982, "grad_norm": 4.778005599975586, "learning_rate": 0.0004106969024216348, "loss": 6.9271, "step": 35 }, { "epoch": 0.06315789473684211, "grad_norm": 4.167423248291016, "learning_rate": 0.00040391536883141455, "loss": 5.9028, "step": 36 }, { "epoch": 0.06491228070175438, "grad_norm": 4.609232425689697, "learning_rate": 0.0003969463130731183, "loss": 6.2515, "step": 37 }, { "epoch": 0.06666666666666667, "grad_norm": 5.198238372802734, "learning_rate": 0.0003897982258676867, "loss": 5.5528, "step": 38 }, { "epoch": 0.06842105263157895, "grad_norm": 4.857692241668701, "learning_rate": 0.00038247981605830125, "loss": 5.8487, "step": 39 }, { "epoch": 0.07017543859649122, "grad_norm": 4.872177600860596, "learning_rate": 0.000375, "loss": 6.95, "step": 40 }, { "epoch": 0.07192982456140351, "grad_norm": 4.162843227386475, "learning_rate": 0.0003673678906964727, "loss": 5.7473, "step": 41 }, { "epoch": 0.07368421052631578, "grad_norm": 5.556955337524414, "learning_rate": 0.00035959278669726934, "loss": 7.0701, "step": 42 }, { "epoch": 0.07543859649122807, "grad_norm": 5.303568363189697, "learning_rate": 0.0003516841607689501, "loss": 6.2933, "step": 43 }, { "epoch": 0.07719298245614035, "grad_norm": 4.08948278427124, "learning_rate": 0.00034365164835397803, "loss": 5.4379, "step": 44 }, { "epoch": 0.07894736842105263, "grad_norm": 4.459823131561279, "learning_rate": 0.0003355050358314172, "loss": 6.3031, "step": 45 }, { "epoch": 0.08070175438596491, "grad_norm": 4.328676700592041, "learning_rate": 0.00032725424859373687, "loss": 6.6055, "step": 46 }, { "epoch": 0.0824561403508772, "grad_norm": 3.6745636463165283, "learning_rate": 0.0003189093389542498, "loss": 5.9473, "step": 47 }, { "epoch": 0.08421052631578947, "grad_norm": 3.972637891769409, "learning_rate": 0.0003104804738999169, "loss": 5.7058, "step": 48 }, { "epoch": 0.08596491228070176, "grad_norm": 3.9648842811584473, "learning_rate": 0.0003019779227044398, "loss": 5.483, "step": 49 }, { "epoch": 0.08771929824561403, "grad_norm": 4.159786701202393, "learning_rate": 0.00029341204441673266, "loss": 6.0755, "step": 50 }, { "epoch": 0.08947368421052632, "grad_norm": 4.105205059051514, "learning_rate": 0.00028479327524001636, "loss": 5.9433, "step": 51 }, { "epoch": 0.0912280701754386, "grad_norm": 4.009059429168701, "learning_rate": 0.0002761321158169134, "loss": 6.3342, "step": 52 }, { "epoch": 0.09298245614035087, "grad_norm": 3.8199257850646973, "learning_rate": 0.0002674391184360313, "loss": 5.7817, "step": 53 }, { "epoch": 0.09473684210526316, "grad_norm": 4.428110122680664, "learning_rate": 0.0002587248741756253, "loss": 5.9798, "step": 54 }, { "epoch": 0.09649122807017543, "grad_norm": 3.8655333518981934, "learning_rate": 0.00025, "loss": 5.3399, "step": 55 }, { "epoch": 0.09824561403508772, "grad_norm": 4.174890518188477, "learning_rate": 0.00024127512582437484, "loss": 6.3818, "step": 56 }, { "epoch": 0.1, "grad_norm": 4.533304691314697, "learning_rate": 0.00023256088156396867, "loss": 6.332, "step": 57 }, { "epoch": 0.10175438596491228, "grad_norm": 4.358687400817871, "learning_rate": 0.00022386788418308668, "loss": 6.2303, "step": 58 }, { "epoch": 0.10350877192982456, "grad_norm": 4.2779035568237305, "learning_rate": 0.0002152067247599837, "loss": 6.5406, "step": 59 }, { "epoch": 0.10526315789473684, "grad_norm": 4.2128071784973145, "learning_rate": 0.00020658795558326743, "loss": 5.793, "step": 60 }, { "epoch": 0.10526315789473684, "eval_loss": 1.479407548904419, "eval_runtime": 4.2086, "eval_samples_per_second": 57.026, "eval_steps_per_second": 28.513, "step": 60 }, { "epoch": 0.10701754385964912, "grad_norm": 4.648571014404297, "learning_rate": 0.0001980220772955602, "loss": 6.2166, "step": 61 }, { "epoch": 0.10877192982456141, "grad_norm": 4.210023403167725, "learning_rate": 0.0001895195261000831, "loss": 6.2991, "step": 62 }, { "epoch": 0.11052631578947368, "grad_norm": 4.311450958251953, "learning_rate": 0.00018109066104575022, "loss": 6.5687, "step": 63 }, { "epoch": 0.11228070175438597, "grad_norm": 3.9065804481506348, "learning_rate": 0.00017274575140626317, "loss": 5.9996, "step": 64 }, { "epoch": 0.11403508771929824, "grad_norm": 3.7439773082733154, "learning_rate": 0.00016449496416858284, "loss": 5.8583, "step": 65 }, { "epoch": 0.11578947368421053, "grad_norm": 4.342467784881592, "learning_rate": 0.00015634835164602198, "loss": 5.668, "step": 66 }, { "epoch": 0.11754385964912281, "grad_norm": 3.9023163318634033, "learning_rate": 0.00014831583923105, "loss": 5.5534, "step": 67 }, { "epoch": 0.11929824561403508, "grad_norm": 3.926267147064209, "learning_rate": 0.00014040721330273062, "loss": 5.4874, "step": 68 }, { "epoch": 0.12105263157894737, "grad_norm": 4.336432456970215, "learning_rate": 0.00013263210930352737, "loss": 6.197, "step": 69 }, { "epoch": 0.12280701754385964, "grad_norm": 3.996743679046631, "learning_rate": 0.00012500000000000006, "loss": 6.2836, "step": 70 }, { "epoch": 0.12456140350877193, "grad_norm": 3.9761931896209717, "learning_rate": 0.0001175201839416988, "loss": 5.7974, "step": 71 }, { "epoch": 0.12631578947368421, "grad_norm": 4.102511405944824, "learning_rate": 0.00011020177413231333, "loss": 5.691, "step": 72 }, { "epoch": 0.1280701754385965, "grad_norm": 4.003103733062744, "learning_rate": 0.00010305368692688174, "loss": 6.0781, "step": 73 }, { "epoch": 0.12982456140350876, "grad_norm": 3.9753754138946533, "learning_rate": 9.608463116858542e-05, "loss": 6.4986, "step": 74 }, { "epoch": 0.13157894736842105, "grad_norm": 3.6435089111328125, "learning_rate": 8.930309757836516e-05, "loss": 5.8422, "step": 75 }, { "epoch": 0.13333333333333333, "grad_norm": 4.879597187042236, "learning_rate": 8.271734841028553e-05, "loss": 6.6415, "step": 76 }, { "epoch": 0.13508771929824562, "grad_norm": 4.149867534637451, "learning_rate": 7.633540738525066e-05, "loss": 5.9788, "step": 77 }, { "epoch": 0.1368421052631579, "grad_norm": 4.045780181884766, "learning_rate": 7.016504991533726e-05, "loss": 6.4516, "step": 78 }, { "epoch": 0.13859649122807016, "grad_norm": 4.346358299255371, "learning_rate": 6.421379363065141e-05, "loss": 6.3082, "step": 79 }, { "epoch": 0.14035087719298245, "grad_norm": 4.118051052093506, "learning_rate": 5.848888922025553e-05, "loss": 5.5206, "step": 80 }, { "epoch": 0.14210526315789473, "grad_norm": 3.69450044631958, "learning_rate": 5.299731159831953e-05, "loss": 5.4157, "step": 81 }, { "epoch": 0.14385964912280702, "grad_norm": 4.054089546203613, "learning_rate": 4.7745751406263163e-05, "loss": 6.0685, "step": 82 }, { "epoch": 0.1456140350877193, "grad_norm": 3.7574148178100586, "learning_rate": 4.274060686123959e-05, "loss": 5.4767, "step": 83 }, { "epoch": 0.14736842105263157, "grad_norm": 4.498287200927734, "learning_rate": 3.798797596089351e-05, "loss": 6.6051, "step": 84 }, { "epoch": 0.14912280701754385, "grad_norm": 4.071403980255127, "learning_rate": 3.3493649053890325e-05, "loss": 5.0443, "step": 85 }, { "epoch": 0.15087719298245614, "grad_norm": 4.087273597717285, "learning_rate": 2.9263101785268254e-05, "loss": 5.9345, "step": 86 }, { "epoch": 0.15263157894736842, "grad_norm": 3.6338565349578857, "learning_rate": 2.5301488425208295e-05, "loss": 5.2453, "step": 87 }, { "epoch": 0.1543859649122807, "grad_norm": 4.064896583557129, "learning_rate": 2.1613635589349755e-05, "loss": 6.2742, "step": 88 }, { "epoch": 0.156140350877193, "grad_norm": 4.007572174072266, "learning_rate": 1.8204036358303172e-05, "loss": 5.6121, "step": 89 }, { "epoch": 0.15789473684210525, "grad_norm": 3.665036916732788, "learning_rate": 1.5076844803522921e-05, "loss": 5.5603, "step": 90 }, { "epoch": 0.15789473684210525, "eval_loss": 1.4289391040802002, "eval_runtime": 4.2102, "eval_samples_per_second": 57.005, "eval_steps_per_second": 28.502, "step": 90 }, { "epoch": 0.15964912280701754, "grad_norm": 4.303161144256592, "learning_rate": 1.2235870926211617e-05, "loss": 6.1717, "step": 91 }, { "epoch": 0.16140350877192983, "grad_norm": 4.152341842651367, "learning_rate": 9.684576015420277e-06, "loss": 6.4078, "step": 92 }, { "epoch": 0.1631578947368421, "grad_norm": 3.9874086380004883, "learning_rate": 7.426068431000882e-06, "loss": 6.0923, "step": 93 }, { "epoch": 0.1649122807017544, "grad_norm": 4.599057197570801, "learning_rate": 5.463099816548578e-06, "loss": 6.3411, "step": 94 }, { "epoch": 0.16666666666666666, "grad_norm": 5.075430393218994, "learning_rate": 3.798061746947995e-06, "loss": 6.3153, "step": 95 }, { "epoch": 0.16842105263157894, "grad_norm": 4.5220046043396, "learning_rate": 2.4329828146074094e-06, "loss": 6.3629, "step": 96 }, { "epoch": 0.17017543859649123, "grad_norm": 4.01290225982666, "learning_rate": 1.3695261579316775e-06, "loss": 5.3428, "step": 97 }, { "epoch": 0.17192982456140352, "grad_norm": 4.395105838775635, "learning_rate": 6.089874350439506e-07, "loss": 6.2479, "step": 98 }, { "epoch": 0.1736842105263158, "grad_norm": 4.438303470611572, "learning_rate": 1.5229324522605948e-07, "loss": 6.0653, "step": 99 }, { "epoch": 0.17543859649122806, "grad_norm": 3.508668899536133, "learning_rate": 0.0, "loss": 5.1474, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6529493237760000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }