{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9998823021695635, "eval_steps": 187, "global_step": 9558, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003138608811644239, "grad_norm": 0.07275390625, "learning_rate": 0.0002, "loss": 1.6202, "step": 1 }, { "epoch": 0.0003138608811644239, "eval_loss": 3.076237440109253, "eval_runtime": 148.221, "eval_samples_per_second": 6.747, "eval_steps_per_second": 6.747, "step": 1 }, { "epoch": 0.0003138608811644239, "mmlu_eval_accuracy": 0.44607718879653785, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.5454545454545454, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.3333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.1, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.6666666666666666, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.75, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.6363636363636364, "mmlu_eval_accuracy_high_school_world_history": 0.5769230769230769, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.84, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.6046511627906976, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.36470588235294116, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.6363636363636364, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 3.0195472597842277, "step": 1 }, { "epoch": 0.0006277217623288478, "grad_norm": 0.0849609375, "learning_rate": 0.0002, "loss": 1.6781, "step": 2 }, { "epoch": 0.0009415826434932716, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.6066, "step": 3 }, { "epoch": 0.0012554435246576955, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.6144, "step": 4 }, { "epoch": 0.0015693044058221193, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.5999, "step": 5 }, { "epoch": 0.0018831652869865433, "grad_norm": 0.10693359375, "learning_rate": 0.0002, "loss": 1.4956, "step": 6 }, { "epoch": 0.002197026168150967, "grad_norm": 0.10546875, "learning_rate": 0.0002, "loss": 1.5835, "step": 7 }, { "epoch": 0.002510887049315391, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.4302, "step": 8 }, { "epoch": 0.002824747930479815, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.5676, "step": 9 }, { "epoch": 0.0031386088116442386, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.573, "step": 10 }, { "epoch": 0.0034524696928086625, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.4498, "step": 11 }, { "epoch": 0.0037663305739730865, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.7236, "step": 12 }, { "epoch": 0.0040801914551375105, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5408, "step": 13 }, { "epoch": 0.004394052336301934, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.4307, "step": 14 }, { "epoch": 0.004707913217466358, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.584, "step": 15 }, { "epoch": 0.005021774098630782, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4904, "step": 16 }, { "epoch": 0.005335634979795206, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.848, "step": 17 }, { "epoch": 0.00564949586095963, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.5244, "step": 18 }, { "epoch": 0.005963356742124054, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.4058, "step": 19 }, { "epoch": 0.006277217623288477, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.5462, "step": 20 }, { "epoch": 0.0065910785044529015, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.6042, "step": 21 }, { "epoch": 0.006904939385617325, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6849, "step": 22 }, { "epoch": 0.007218800266781749, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 2.0465, "step": 23 }, { "epoch": 0.007532661147946173, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.6686, "step": 24 }, { "epoch": 0.007846522029110597, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 1.7766, "step": 25 }, { "epoch": 0.008160382910275021, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.9928, "step": 26 }, { "epoch": 0.008474243791439445, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 2.442, "step": 27 }, { "epoch": 0.008788104672603868, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.9709, "step": 28 }, { "epoch": 0.009101965553768292, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.1932, "step": 29 }, { "epoch": 0.009415826434932715, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.1419, "step": 30 }, { "epoch": 0.00972968731609714, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.1, "step": 31 }, { "epoch": 0.010043548197261564, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 2.4471, "step": 32 }, { "epoch": 0.010357409078425988, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 2.6097, "step": 33 }, { "epoch": 0.010671269959590411, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.2288, "step": 34 }, { "epoch": 0.010985130840754835, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 2.6281, "step": 35 }, { "epoch": 0.01129899172191926, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.9545, "step": 36 }, { "epoch": 0.011612852603083684, "grad_norm": 1.3671875, "learning_rate": 0.0002, "loss": 2.2779, "step": 37 }, { "epoch": 0.011926713484248107, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 2.5123, "step": 38 }, { "epoch": 0.01224057436541253, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 2.5084, "step": 39 }, { "epoch": 0.012554435246576954, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.5873, "step": 40 }, { "epoch": 0.012868296127741378, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 2.4971, "step": 41 }, { "epoch": 0.013182157008905803, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 2.3611, "step": 42 }, { "epoch": 0.013496017890070227, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 2.9962, "step": 43 }, { "epoch": 0.01380987877123465, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.3759, "step": 44 }, { "epoch": 0.014123739652399074, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.4454, "step": 45 }, { "epoch": 0.014437600533563497, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.1861, "step": 46 }, { "epoch": 0.014751461414727923, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 2.5668, "step": 47 }, { "epoch": 0.015065322295892346, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.0805, "step": 48 }, { "epoch": 0.01537918317705677, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 2.8306, "step": 49 }, { "epoch": 0.015693044058221195, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 2.2581, "step": 50 }, { "epoch": 0.016006904939385617, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.5381, "step": 51 }, { "epoch": 0.016320765820550042, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.3098, "step": 52 }, { "epoch": 0.016634626701714464, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5292, "step": 53 }, { "epoch": 0.01694848758287889, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.6597, "step": 54 }, { "epoch": 0.017262348464043314, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4327, "step": 55 }, { "epoch": 0.017576209345207736, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4566, "step": 56 }, { "epoch": 0.01789007022637216, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.3754, "step": 57 }, { "epoch": 0.018203931107536583, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7264, "step": 58 }, { "epoch": 0.01851779198870101, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4453, "step": 59 }, { "epoch": 0.01883165286986543, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.463, "step": 60 }, { "epoch": 0.019145513751029856, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.4478, "step": 61 }, { "epoch": 0.01945937463219428, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.5525, "step": 62 }, { "epoch": 0.019773235513358703, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.466, "step": 63 }, { "epoch": 0.020087096394523128, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3745, "step": 64 }, { "epoch": 0.02040095727568755, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4817, "step": 65 }, { "epoch": 0.020714818156851975, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4156, "step": 66 }, { "epoch": 0.0210286790380164, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.7504, "step": 67 }, { "epoch": 0.021342539919180822, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.4968, "step": 68 }, { "epoch": 0.021656400800345248, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.585, "step": 69 }, { "epoch": 0.02197026168150967, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.5652, "step": 70 }, { "epoch": 0.022284122562674095, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.2941, "step": 71 }, { "epoch": 0.02259798344383852, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.7452, "step": 72 }, { "epoch": 0.022911844325002942, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.5938, "step": 73 }, { "epoch": 0.023225705206167367, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.781, "step": 74 }, { "epoch": 0.02353956608733179, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.792, "step": 75 }, { "epoch": 0.023853426968496214, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.9774, "step": 76 }, { "epoch": 0.02416728784966064, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 2.4913, "step": 77 }, { "epoch": 0.02448114873082506, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.0286, "step": 78 }, { "epoch": 0.024795009611989487, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.3414, "step": 79 }, { "epoch": 0.02510887049315391, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.8796, "step": 80 }, { "epoch": 0.025422731374318334, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.9177, "step": 81 }, { "epoch": 0.025736592255482756, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.7417, "step": 82 }, { "epoch": 0.02605045313664718, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.3617, "step": 83 }, { "epoch": 0.026364314017811606, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.418, "step": 84 }, { "epoch": 0.026678174898976028, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 2.7761, "step": 85 }, { "epoch": 0.026992035780140453, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.3215, "step": 86 }, { "epoch": 0.027305896661304875, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.1868, "step": 87 }, { "epoch": 0.0276197575424693, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.0753, "step": 88 }, { "epoch": 0.027933618423633726, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 2.2398, "step": 89 }, { "epoch": 0.028247479304798147, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 2.4929, "step": 90 }, { "epoch": 0.028561340185962573, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 2.4407, "step": 91 }, { "epoch": 0.028875201067126995, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 2.7458, "step": 92 }, { "epoch": 0.02918906194829142, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.0544, "step": 93 }, { "epoch": 0.029502922829455845, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 2.6945, "step": 94 }, { "epoch": 0.029816783710620267, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.3824, "step": 95 }, { "epoch": 0.030130644591784692, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.2367, "step": 96 }, { "epoch": 0.030444505472949114, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 2.6735, "step": 97 }, { "epoch": 0.03075836635411354, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.3055, "step": 98 }, { "epoch": 0.031072227235277965, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.432, "step": 99 }, { "epoch": 0.03138608811644239, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 3.0268, "step": 100 }, { "epoch": 0.03169994899760681, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.5252, "step": 101 }, { "epoch": 0.032013809878771234, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.5605, "step": 102 }, { "epoch": 0.03232767075993566, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.5084, "step": 103 }, { "epoch": 0.032641531641100084, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.462, "step": 104 }, { "epoch": 0.03295539252226451, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.6249, "step": 105 }, { "epoch": 0.03326925340342893, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.641, "step": 106 }, { "epoch": 0.03358311428459335, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.4551, "step": 107 }, { "epoch": 0.03389697516575778, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.4532, "step": 108 }, { "epoch": 0.034210836046922204, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.5638, "step": 109 }, { "epoch": 0.03452469692808663, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6085, "step": 110 }, { "epoch": 0.03483855780925105, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4586, "step": 111 }, { "epoch": 0.03515241869041547, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.4463, "step": 112 }, { "epoch": 0.0354662795715799, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.6576, "step": 113 }, { "epoch": 0.03578014045274432, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.4852, "step": 114 }, { "epoch": 0.03609400133390874, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3461, "step": 115 }, { "epoch": 0.03640786221507317, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.3754, "step": 116 }, { "epoch": 0.03672172309623759, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5055, "step": 117 }, { "epoch": 0.03703558397740202, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5216, "step": 118 }, { "epoch": 0.03734944485856644, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5402, "step": 119 }, { "epoch": 0.03766330573973086, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5661, "step": 120 }, { "epoch": 0.037977166620895286, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.533, "step": 121 }, { "epoch": 0.03829102750205971, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.6953, "step": 122 }, { "epoch": 0.03860488838322414, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3169, "step": 123 }, { "epoch": 0.03891874926438856, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.7129, "step": 124 }, { "epoch": 0.03923261014555298, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3726, "step": 125 }, { "epoch": 0.039546471026717406, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.6792, "step": 126 }, { "epoch": 0.03986033190788183, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.3559, "step": 127 }, { "epoch": 0.040174192789046256, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.5121, "step": 128 }, { "epoch": 0.04048805367021068, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.715, "step": 129 }, { "epoch": 0.0408019145513751, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 2.0793, "step": 130 }, { "epoch": 0.041115775432539525, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.7691, "step": 131 }, { "epoch": 0.04142963631370395, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.973, "step": 132 }, { "epoch": 0.041743497194868376, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.9109, "step": 133 }, { "epoch": 0.0420573580760328, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 2.0848, "step": 134 }, { "epoch": 0.04237121895719722, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 2.6422, "step": 135 }, { "epoch": 0.042685079838361645, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.3111, "step": 136 }, { "epoch": 0.04299894071952607, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.421, "step": 137 }, { "epoch": 0.043312801600690495, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.2324, "step": 138 }, { "epoch": 0.04362666248185492, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1292, "step": 139 }, { "epoch": 0.04394052336301934, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 2.945, "step": 140 }, { "epoch": 0.044254384244183764, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.5678, "step": 141 }, { "epoch": 0.04456824512534819, "grad_norm": 1.8359375, "learning_rate": 0.0002, "loss": 2.4902, "step": 142 }, { "epoch": 0.044882106006512615, "grad_norm": 1.3515625, "learning_rate": 0.0002, "loss": 3.1968, "step": 143 }, { "epoch": 0.04519596688767704, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 2.4699, "step": 144 }, { "epoch": 0.04550982776884146, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.2869, "step": 145 }, { "epoch": 0.045823688650005884, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.2249, "step": 146 }, { "epoch": 0.04613754953117031, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 2.4014, "step": 147 }, { "epoch": 0.046451410412334734, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.3175, "step": 148 }, { "epoch": 0.04676527129349916, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.7023, "step": 149 }, { "epoch": 0.04707913217466358, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 3.2566, "step": 150 }, { "epoch": 0.047392993055828, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.4692, "step": 151 }, { "epoch": 0.04770685393699243, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.4202, "step": 152 }, { "epoch": 0.048020714818156854, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.4694, "step": 153 }, { "epoch": 0.04833457569932128, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.5723, "step": 154 }, { "epoch": 0.0486484365804857, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.5976, "step": 155 }, { "epoch": 0.04896229746165012, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.3935, "step": 156 }, { "epoch": 0.04927615834281455, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.45, "step": 157 }, { "epoch": 0.04959001922397897, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.4176, "step": 158 }, { "epoch": 0.0499038801051434, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.498, "step": 159 }, { "epoch": 0.05021774098630782, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.5622, "step": 160 }, { "epoch": 0.05053160186747224, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5955, "step": 161 }, { "epoch": 0.05084546274863667, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.6085, "step": 162 }, { "epoch": 0.05115932362980109, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.4561, "step": 163 }, { "epoch": 0.05147318451096551, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.3197, "step": 164 }, { "epoch": 0.051787045392129936, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4097, "step": 165 }, { "epoch": 0.05210090627329436, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4671, "step": 166 }, { "epoch": 0.05241476715445879, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.713, "step": 167 }, { "epoch": 0.05272862803562321, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4736, "step": 168 }, { "epoch": 0.05304248891678763, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5112, "step": 169 }, { "epoch": 0.053356349797952056, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.9055, "step": 170 }, { "epoch": 0.05367021067911648, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.502, "step": 171 }, { "epoch": 0.05398407156028091, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.6173, "step": 172 }, { "epoch": 0.05429793244144533, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.6522, "step": 173 }, { "epoch": 0.05461179332260975, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.6627, "step": 174 }, { "epoch": 0.054925654203774175, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.718, "step": 175 }, { "epoch": 0.0552395150849386, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.5639, "step": 176 }, { "epoch": 0.055553375966103026, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.8627, "step": 177 }, { "epoch": 0.05586723684726745, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.9256, "step": 178 }, { "epoch": 0.05618109772843187, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.9848, "step": 179 }, { "epoch": 0.056494958609596295, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.0576, "step": 180 }, { "epoch": 0.05680881949076072, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.289, "step": 181 }, { "epoch": 0.057122680371925146, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.5124, "step": 182 }, { "epoch": 0.05743654125308957, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.1902, "step": 183 }, { "epoch": 0.05775040213425399, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.3745, "step": 184 }, { "epoch": 0.058064263015418414, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 3.0241, "step": 185 }, { "epoch": 0.05837812389658284, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.6551, "step": 186 }, { "epoch": 0.058691984777747265, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.6666, "step": 187 }, { "epoch": 0.058691984777747265, "eval_loss": 1.8769614696502686, "eval_runtime": 123.4185, "eval_samples_per_second": 8.103, "eval_steps_per_second": 8.103, "step": 187 }, { "epoch": 0.058691984777747265, "mmlu_eval_accuracy": 0.44123983572722647, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.3333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.7142857142857143, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.627906976744186, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.5757575757575758, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.35294117647058826, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.7272727272727273, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 0.9604393265299513, "step": 187 }, { "epoch": 0.05900584565891169, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.422, "step": 188 }, { "epoch": 0.05931970654007611, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.571, "step": 189 }, { "epoch": 0.059633567421240534, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.2629, "step": 190 }, { "epoch": 0.05994742830240496, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.8845, "step": 191 }, { "epoch": 0.060261289183569385, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.1545, "step": 192 }, { "epoch": 0.06057515006473381, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.5156, "step": 193 }, { "epoch": 0.06088901094589823, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 2.517, "step": 194 }, { "epoch": 0.06120287182706265, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.2571, "step": 195 }, { "epoch": 0.06151673270822708, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 2.2549, "step": 196 }, { "epoch": 0.061830593589391504, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 2.275, "step": 197 }, { "epoch": 0.06214445447055593, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.0545, "step": 198 }, { "epoch": 0.06245831535172035, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 3.1062, "step": 199 }, { "epoch": 0.06277217623288478, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 2.6846, "step": 200 }, { "epoch": 0.0630860371140492, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.5702, "step": 201 }, { "epoch": 0.06339989799521362, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6625, "step": 202 }, { "epoch": 0.06371375887637805, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.423, "step": 203 }, { "epoch": 0.06402761975754247, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5124, "step": 204 }, { "epoch": 0.0643414806387069, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.499, "step": 205 }, { "epoch": 0.06465534151987132, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5965, "step": 206 }, { "epoch": 0.06496920240103574, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.3176, "step": 207 }, { "epoch": 0.06528306328220017, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.4845, "step": 208 }, { "epoch": 0.06559692416336459, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.3401, "step": 209 }, { "epoch": 0.06591078504452902, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.4552, "step": 210 }, { "epoch": 0.06622464592569344, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4667, "step": 211 }, { "epoch": 0.06653850680685786, "grad_norm": 0.1142578125, "learning_rate": 0.0002, "loss": 1.3008, "step": 212 }, { "epoch": 0.06685236768802229, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.7032, "step": 213 }, { "epoch": 0.0671662285691867, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.4191, "step": 214 }, { "epoch": 0.06748008945035114, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3981, "step": 215 }, { "epoch": 0.06779395033151556, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.3225, "step": 216 }, { "epoch": 0.06810781121267998, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.4282, "step": 217 }, { "epoch": 0.06842167209384441, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7299, "step": 218 }, { "epoch": 0.06873553297500883, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.4266, "step": 219 }, { "epoch": 0.06904939385617326, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.4593, "step": 220 }, { "epoch": 0.06936325473733768, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7093, "step": 221 }, { "epoch": 0.0696771156185021, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.7773, "step": 222 }, { "epoch": 0.06999097649966653, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.7139, "step": 223 }, { "epoch": 0.07030483738083095, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.8133, "step": 224 }, { "epoch": 0.07061869826199538, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.6248, "step": 225 }, { "epoch": 0.0709325591431598, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7408, "step": 226 }, { "epoch": 0.07124642002432421, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.7419, "step": 227 }, { "epoch": 0.07156028090548865, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.7289, "step": 228 }, { "epoch": 0.07187414178665306, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.2417, "step": 229 }, { "epoch": 0.07218800266781748, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.8025, "step": 230 }, { "epoch": 0.07250186354898192, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.0277, "step": 231 }, { "epoch": 0.07281572443014633, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 2.6918, "step": 232 }, { "epoch": 0.07312958531131077, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.0573, "step": 233 }, { "epoch": 0.07344344619247518, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.506, "step": 234 }, { "epoch": 0.0737573070736396, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.009, "step": 235 }, { "epoch": 0.07407116795480403, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 2.2538, "step": 236 }, { "epoch": 0.07438502883596845, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.4865, "step": 237 }, { "epoch": 0.07469888971713289, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.3167, "step": 238 }, { "epoch": 0.0750127505982973, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.3738, "step": 239 }, { "epoch": 0.07532661147946172, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.0565, "step": 240 }, { "epoch": 0.07564047236062615, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.5195, "step": 241 }, { "epoch": 0.07595433324179057, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.3961, "step": 242 }, { "epoch": 0.076268194122955, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 2.8956, "step": 243 }, { "epoch": 0.07658205500411942, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.5885, "step": 244 }, { "epoch": 0.07689591588528384, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.8268, "step": 245 }, { "epoch": 0.07720977676644827, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.9894, "step": 246 }, { "epoch": 0.07752363764761269, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.0273, "step": 247 }, { "epoch": 0.07783749852877712, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.2477, "step": 248 }, { "epoch": 0.07815135940994154, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 2.5224, "step": 249 }, { "epoch": 0.07846522029110596, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 3.2266, "step": 250 }, { "epoch": 0.0787790811722704, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4904, "step": 251 }, { "epoch": 0.07909294205343481, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.4279, "step": 252 }, { "epoch": 0.07940680293459924, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.5421, "step": 253 }, { "epoch": 0.07972066381576366, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.5092, "step": 254 }, { "epoch": 0.08003452469692808, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.5434, "step": 255 }, { "epoch": 0.08034838557809251, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4732, "step": 256 }, { "epoch": 0.08066224645925693, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4807, "step": 257 }, { "epoch": 0.08097610734042136, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5032, "step": 258 }, { "epoch": 0.08128996822158578, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5402, "step": 259 }, { "epoch": 0.0816038291027502, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4766, "step": 260 }, { "epoch": 0.08191768998391463, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4982, "step": 261 }, { "epoch": 0.08223155086507905, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3532, "step": 262 }, { "epoch": 0.08254541174624348, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.3225, "step": 263 }, { "epoch": 0.0828592726274079, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4933, "step": 264 }, { "epoch": 0.08317313350857232, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.4604, "step": 265 }, { "epoch": 0.08348699438973675, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.337, "step": 266 }, { "epoch": 0.08380085527090117, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.465, "step": 267 }, { "epoch": 0.0841147161520656, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.5809, "step": 268 }, { "epoch": 0.08442857703323002, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.921, "step": 269 }, { "epoch": 0.08474243791439444, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.5733, "step": 270 }, { "epoch": 0.08505629879555887, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5355, "step": 271 }, { "epoch": 0.08537015967672329, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.6797, "step": 272 }, { "epoch": 0.08568402055788772, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.6325, "step": 273 }, { "epoch": 0.08599788143905214, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.5476, "step": 274 }, { "epoch": 0.08631174232021656, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.6788, "step": 275 }, { "epoch": 0.08662560320138099, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 2.0317, "step": 276 }, { "epoch": 0.08693946408254541, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.8197, "step": 277 }, { "epoch": 0.08725332496370984, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.7157, "step": 278 }, { "epoch": 0.08756718584487426, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.2781, "step": 279 }, { "epoch": 0.08788104672603868, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.1994, "step": 280 }, { "epoch": 0.08819490760720311, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.1835, "step": 281 }, { "epoch": 0.08850876848836753, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.9166, "step": 282 }, { "epoch": 0.08882262936953196, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.1354, "step": 283 }, { "epoch": 0.08913649025069638, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.3373, "step": 284 }, { "epoch": 0.0894503511318608, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.5996, "step": 285 }, { "epoch": 0.08976421201302523, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.9957, "step": 286 }, { "epoch": 0.09007807289418965, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 2.4719, "step": 287 }, { "epoch": 0.09039193377535408, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.1207, "step": 288 }, { "epoch": 0.0907057946565185, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 2.365, "step": 289 }, { "epoch": 0.09101965553768292, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 2.3941, "step": 290 }, { "epoch": 0.09133351641884735, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.1109, "step": 291 }, { "epoch": 0.09164737730001177, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 2.3109, "step": 292 }, { "epoch": 0.0919612381811762, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.3769, "step": 293 }, { "epoch": 0.09227509906234062, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 2.5643, "step": 294 }, { "epoch": 0.09258895994350504, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.6575, "step": 295 }, { "epoch": 0.09290282082466947, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.0201, "step": 296 }, { "epoch": 0.09321668170583389, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.6622, "step": 297 }, { "epoch": 0.09353054258699832, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 2.7948, "step": 298 }, { "epoch": 0.09384440346816274, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 2.6774, "step": 299 }, { "epoch": 0.09415826434932716, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 3.0224, "step": 300 }, { "epoch": 0.09447212523049159, "grad_norm": 0.10791015625, "learning_rate": 0.0002, "loss": 1.4284, "step": 301 }, { "epoch": 0.094785986111656, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.4059, "step": 302 }, { "epoch": 0.09509984699282044, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4115, "step": 303 }, { "epoch": 0.09541370787398486, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4218, "step": 304 }, { "epoch": 0.09572756875514928, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.4053, "step": 305 }, { "epoch": 0.09604142963631371, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.6277, "step": 306 }, { "epoch": 0.09635529051747813, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3927, "step": 307 }, { "epoch": 0.09666915139864256, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3535, "step": 308 }, { "epoch": 0.09698301227980698, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3208, "step": 309 }, { "epoch": 0.0972968731609714, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.4483, "step": 310 }, { "epoch": 0.09761073404213583, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.8075, "step": 311 }, { "epoch": 0.09792459492330025, "grad_norm": 0.10595703125, "learning_rate": 0.0002, "loss": 1.2494, "step": 312 }, { "epoch": 0.09823845580446468, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4753, "step": 313 }, { "epoch": 0.0985523166856291, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5693, "step": 314 }, { "epoch": 0.09886617756679351, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.441, "step": 315 }, { "epoch": 0.09918003844795795, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6538, "step": 316 }, { "epoch": 0.09949389932912237, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5327, "step": 317 }, { "epoch": 0.0998077602102868, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.2743, "step": 318 }, { "epoch": 0.10012162109145122, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.282, "step": 319 }, { "epoch": 0.10043548197261563, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.2735, "step": 320 }, { "epoch": 0.10074934285378007, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.5729, "step": 321 }, { "epoch": 0.10106320373494448, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.4801, "step": 322 }, { "epoch": 0.1013770646161089, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.6091, "step": 323 }, { "epoch": 0.10169092549727334, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.8584, "step": 324 }, { "epoch": 0.10200478637843775, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.507, "step": 325 }, { "epoch": 0.10231864725960219, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.6124, "step": 326 }, { "epoch": 0.1026325081407666, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.7112, "step": 327 }, { "epoch": 0.10294636902193102, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.1519, "step": 328 }, { "epoch": 0.10326022990309545, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.941, "step": 329 }, { "epoch": 0.10357409078425987, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.8767, "step": 330 }, { "epoch": 0.1038879516654243, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.9287, "step": 331 }, { "epoch": 0.10420181254658872, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.7912, "step": 332 }, { "epoch": 0.10451567342775314, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.8166, "step": 333 }, { "epoch": 0.10482953430891757, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.3756, "step": 334 }, { "epoch": 0.10514339519008199, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9608, "step": 335 }, { "epoch": 0.10545725607124642, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.2499, "step": 336 }, { "epoch": 0.10577111695241084, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 2.3659, "step": 337 }, { "epoch": 0.10608497783357526, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 2.304, "step": 338 }, { "epoch": 0.1063988387147397, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.0312, "step": 339 }, { "epoch": 0.10671269959590411, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.2391, "step": 340 }, { "epoch": 0.10702656047706854, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.0306, "step": 341 }, { "epoch": 0.10734042135823296, "grad_norm": 1.8828125, "learning_rate": 0.0002, "loss": 2.5958, "step": 342 }, { "epoch": 0.10765428223939738, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 2.9164, "step": 343 }, { "epoch": 0.10796814312056181, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.3001, "step": 344 }, { "epoch": 0.10828200400172623, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 2.5496, "step": 345 }, { "epoch": 0.10859586488289066, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.1181, "step": 346 }, { "epoch": 0.10890972576405508, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.3313, "step": 347 }, { "epoch": 0.1092235866452195, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.5624, "step": 348 }, { "epoch": 0.10953744752638393, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.3083, "step": 349 }, { "epoch": 0.10985130840754835, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 3.233, "step": 350 }, { "epoch": 0.11016516928871278, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3294, "step": 351 }, { "epoch": 0.1104790301698772, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5399, "step": 352 }, { "epoch": 0.11079289105104162, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4208, "step": 353 }, { "epoch": 0.11110675193220605, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.4345, "step": 354 }, { "epoch": 0.11142061281337047, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.424, "step": 355 }, { "epoch": 0.1117344736945349, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.6212, "step": 356 }, { "epoch": 0.11204833457569932, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.4564, "step": 357 }, { "epoch": 0.11236219545686374, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5485, "step": 358 }, { "epoch": 0.11267605633802817, "grad_norm": 0.10498046875, "learning_rate": 0.0002, "loss": 1.2384, "step": 359 }, { "epoch": 0.11298991721919259, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4083, "step": 360 }, { "epoch": 0.11330377810035702, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3978, "step": 361 }, { "epoch": 0.11361763898152144, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.4216, "step": 362 }, { "epoch": 0.11393149986268586, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.5479, "step": 363 }, { "epoch": 0.11424536074385029, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4693, "step": 364 }, { "epoch": 0.11455922162501471, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4133, "step": 365 }, { "epoch": 0.11487308250617914, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.1875, "step": 366 }, { "epoch": 0.11518694338734356, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.5695, "step": 367 }, { "epoch": 0.11550080426850798, "grad_norm": 0.12060546875, "learning_rate": 0.0002, "loss": 1.1122, "step": 368 }, { "epoch": 0.11581466514967241, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5259, "step": 369 }, { "epoch": 0.11612852603083683, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.4507, "step": 370 }, { "epoch": 0.11644238691200126, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.6836, "step": 371 }, { "epoch": 0.11675624779316568, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.6754, "step": 372 }, { "epoch": 0.1170701086743301, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.4691, "step": 373 }, { "epoch": 0.11738396955549453, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.7798, "step": 374 }, { "epoch": 0.11738396955549453, "eval_loss": 1.8649098873138428, "eval_runtime": 144.7095, "eval_samples_per_second": 6.91, "eval_steps_per_second": 6.91, "step": 374 }, { "epoch": 0.11738396955549453, "mmlu_eval_accuracy": 0.43153515218737043, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.5454545454545454, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.36585365853658536, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.0, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5769230769230769, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.35294117647058826, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.7368421052631579, "mmlu_loss": 1.4904409643094731, "step": 374 }, { "epoch": 0.11769783043665895, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.6591, "step": 375 }, { "epoch": 0.11801169131782338, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.9671, "step": 376 }, { "epoch": 0.1183255521989878, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.9548, "step": 377 }, { "epoch": 0.11863941308015222, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.869, "step": 378 }, { "epoch": 0.11895327396131665, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.1079, "step": 379 }, { "epoch": 0.11926713484248107, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.2094, "step": 380 }, { "epoch": 0.1195809957236455, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.0853, "step": 381 }, { "epoch": 0.11989485660480992, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.1766, "step": 382 }, { "epoch": 0.12020871748597434, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.873, "step": 383 }, { "epoch": 0.12052257836713877, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.6411, "step": 384 }, { "epoch": 0.12083643924830319, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 2.0356, "step": 385 }, { "epoch": 0.12115030012946762, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.3583, "step": 386 }, { "epoch": 0.12146416101063204, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.0772, "step": 387 }, { "epoch": 0.12177802189179646, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.1947, "step": 388 }, { "epoch": 0.12209188277296089, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.741, "step": 389 }, { "epoch": 0.1224057436541253, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.6193, "step": 390 }, { "epoch": 0.12271960453528974, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.921, "step": 391 }, { "epoch": 0.12303346541645416, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.7521, "step": 392 }, { "epoch": 0.12334732629761858, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.1829, "step": 393 }, { "epoch": 0.12366118717878301, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.3879, "step": 394 }, { "epoch": 0.12397504805994743, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.2603, "step": 395 }, { "epoch": 0.12428890894111186, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.0838, "step": 396 }, { "epoch": 0.12460276982227628, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.8756, "step": 397 }, { "epoch": 0.1249166307034407, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.1248, "step": 398 }, { "epoch": 0.1252304915846051, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.3589, "step": 399 }, { "epoch": 0.12554435246576956, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.4052, "step": 400 }, { "epoch": 0.12585821334693398, "grad_norm": 0.12060546875, "learning_rate": 0.0002, "loss": 1.3253, "step": 401 }, { "epoch": 0.1261720742280984, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.5794, "step": 402 }, { "epoch": 0.12648593510926281, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.5086, "step": 403 }, { "epoch": 0.12679979599042723, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4157, "step": 404 }, { "epoch": 0.12711365687159168, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.402, "step": 405 }, { "epoch": 0.1274275177527561, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.5706, "step": 406 }, { "epoch": 0.12774137863392052, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.441, "step": 407 }, { "epoch": 0.12805523951508493, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.4242, "step": 408 }, { "epoch": 0.12836910039624935, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3529, "step": 409 }, { "epoch": 0.1286829612774138, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5687, "step": 410 }, { "epoch": 0.12899682215857822, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6228, "step": 411 }, { "epoch": 0.12931068303974264, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.5197, "step": 412 }, { "epoch": 0.12962454392090705, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5823, "step": 413 }, { "epoch": 0.12993840480207147, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5927, "step": 414 }, { "epoch": 0.13025226568323592, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.481, "step": 415 }, { "epoch": 0.13056612656440034, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.696, "step": 416 }, { "epoch": 0.13087998744556475, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.4169, "step": 417 }, { "epoch": 0.13119384832672917, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.6478, "step": 418 }, { "epoch": 0.1315077092078936, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.5207, "step": 419 }, { "epoch": 0.13182157008905804, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3612, "step": 420 }, { "epoch": 0.13213543097022246, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5294, "step": 421 }, { "epoch": 0.13244929185138687, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.5707, "step": 422 }, { "epoch": 0.1327631527325513, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6781, "step": 423 }, { "epoch": 0.1330770136137157, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.5306, "step": 424 }, { "epoch": 0.13339087449488016, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.6768, "step": 425 }, { "epoch": 0.13370473537604458, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.7467, "step": 426 }, { "epoch": 0.134018596257209, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.5438, "step": 427 }, { "epoch": 0.1343324571383734, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4618, "step": 428 }, { "epoch": 0.13464631801953783, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.926, "step": 429 }, { "epoch": 0.13496017890070228, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.0031, "step": 430 }, { "epoch": 0.1352740397818667, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.17, "step": 431 }, { "epoch": 0.1355879006630311, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.0235, "step": 432 }, { "epoch": 0.13590176154419553, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.2579, "step": 433 }, { "epoch": 0.13621562242535995, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.141, "step": 434 }, { "epoch": 0.1365294833065244, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.2934, "step": 435 }, { "epoch": 0.13684334418768881, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 2.5946, "step": 436 }, { "epoch": 0.13715720506885323, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.1508, "step": 437 }, { "epoch": 0.13747106595001765, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.1596, "step": 438 }, { "epoch": 0.13778492683118207, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.1806, "step": 439 }, { "epoch": 0.13809878771234652, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.7799, "step": 440 }, { "epoch": 0.13841264859351093, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.95, "step": 441 }, { "epoch": 0.13872650947467535, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.7952, "step": 442 }, { "epoch": 0.13904037035583977, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.338, "step": 443 }, { "epoch": 0.1393542312370042, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.9228, "step": 444 }, { "epoch": 0.13966809211816864, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 2.6961, "step": 445 }, { "epoch": 0.13998195299933305, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.2991, "step": 446 }, { "epoch": 0.14029581388049747, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.3858, "step": 447 }, { "epoch": 0.1406096747616619, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.1153, "step": 448 }, { "epoch": 0.1409235356428263, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 2.3709, "step": 449 }, { "epoch": 0.14123739652399075, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.7573, "step": 450 }, { "epoch": 0.14155125740515517, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.489, "step": 451 }, { "epoch": 0.1418651182863196, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.4093, "step": 452 }, { "epoch": 0.142178979167484, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.6179, "step": 453 }, { "epoch": 0.14249284004864843, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.56, "step": 454 }, { "epoch": 0.14280670092981287, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4018, "step": 455 }, { "epoch": 0.1431205618109773, "grad_norm": 0.1142578125, "learning_rate": 0.0002, "loss": 1.4601, "step": 456 }, { "epoch": 0.1434344226921417, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5944, "step": 457 }, { "epoch": 0.14374828357330613, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.4949, "step": 458 }, { "epoch": 0.14406214445447055, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4648, "step": 459 }, { "epoch": 0.14437600533563497, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.544, "step": 460 }, { "epoch": 0.1446898662167994, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3839, "step": 461 }, { "epoch": 0.14500372709796383, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4504, "step": 462 }, { "epoch": 0.14531758797912825, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.4584, "step": 463 }, { "epoch": 0.14563144886029267, "grad_norm": 0.1142578125, "learning_rate": 0.0002, "loss": 1.427, "step": 464 }, { "epoch": 0.14594530974145709, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.451, "step": 465 }, { "epoch": 0.14625917062262153, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4653, "step": 466 }, { "epoch": 0.14657303150378595, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5181, "step": 467 }, { "epoch": 0.14688689238495037, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3313, "step": 468 }, { "epoch": 0.1472007532661148, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5084, "step": 469 }, { "epoch": 0.1475146141472792, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5112, "step": 470 }, { "epoch": 0.14782847502844365, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.5859, "step": 471 }, { "epoch": 0.14814233590960807, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.4651, "step": 472 }, { "epoch": 0.1484561967907725, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.9876, "step": 473 }, { "epoch": 0.1487700576719369, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.4273, "step": 474 }, { "epoch": 0.14908391855310132, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7127, "step": 475 }, { "epoch": 0.14939777943426577, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.6046, "step": 476 }, { "epoch": 0.1497116403154302, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.8459, "step": 477 }, { "epoch": 0.1500255011965946, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.656, "step": 478 }, { "epoch": 0.15033936207775903, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.6007, "step": 479 }, { "epoch": 0.15065322295892344, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.8374, "step": 480 }, { "epoch": 0.1509670838400879, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.9358, "step": 481 }, { "epoch": 0.1512809447212523, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.38, "step": 482 }, { "epoch": 0.15159480560241673, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.0626, "step": 483 }, { "epoch": 0.15190866648358115, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.383, "step": 484 }, { "epoch": 0.15222252736474556, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.1199, "step": 485 }, { "epoch": 0.15253638824591, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.2974, "step": 486 }, { "epoch": 0.15285024912707443, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.605, "step": 487 }, { "epoch": 0.15316411000823885, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.3385, "step": 488 }, { "epoch": 0.15347797088940326, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.0768, "step": 489 }, { "epoch": 0.15379183177056768, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.1478, "step": 490 }, { "epoch": 0.15410569265173213, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.6997, "step": 491 }, { "epoch": 0.15441955353289655, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.8556, "step": 492 }, { "epoch": 0.15473341441406097, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.7356, "step": 493 }, { "epoch": 0.15504727529522538, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.4964, "step": 494 }, { "epoch": 0.1553611361763898, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.2954, "step": 495 }, { "epoch": 0.15567499705755425, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.0918, "step": 496 }, { "epoch": 0.15598885793871867, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.8699, "step": 497 }, { "epoch": 0.15630271881988309, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.4207, "step": 498 }, { "epoch": 0.1566165797010475, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 2.2597, "step": 499 }, { "epoch": 0.15693044058221192, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 3.121, "step": 500 }, { "epoch": 0.15724430146337637, "grad_norm": 0.11181640625, "learning_rate": 0.0002, "loss": 1.4077, "step": 501 }, { "epoch": 0.1575581623445408, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5515, "step": 502 }, { "epoch": 0.1578720232257052, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4571, "step": 503 }, { "epoch": 0.15818588410686962, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.4592, "step": 504 }, { "epoch": 0.15849974498803404, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4867, "step": 505 }, { "epoch": 0.1588136058691985, "grad_norm": 0.11376953125, "learning_rate": 0.0002, "loss": 1.3015, "step": 506 }, { "epoch": 0.1591274667503629, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.4929, "step": 507 }, { "epoch": 0.15944132763152732, "grad_norm": 0.109375, "learning_rate": 0.0002, "loss": 1.407, "step": 508 }, { "epoch": 0.15975518851269174, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5229, "step": 509 }, { "epoch": 0.16006904939385616, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.5365, "step": 510 }, { "epoch": 0.1603829102750206, "grad_norm": 0.10986328125, "learning_rate": 0.0002, "loss": 1.3955, "step": 511 }, { "epoch": 0.16069677115618503, "grad_norm": 0.10400390625, "learning_rate": 0.0002, "loss": 1.1882, "step": 512 }, { "epoch": 0.16101063203734944, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.518, "step": 513 }, { "epoch": 0.16132449291851386, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3745, "step": 514 }, { "epoch": 0.16163835379967828, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6599, "step": 515 }, { "epoch": 0.16195221468084273, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.4509, "step": 516 }, { "epoch": 0.16226607556200714, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.7153, "step": 517 }, { "epoch": 0.16257993644317156, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.5206, "step": 518 }, { "epoch": 0.16289379732433598, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.627, "step": 519 }, { "epoch": 0.1632076582055004, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.3938, "step": 520 }, { "epoch": 0.16352151908666485, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.4612, "step": 521 }, { "epoch": 0.16383537996782926, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4233, "step": 522 }, { "epoch": 0.16414924084899368, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.4416, "step": 523 }, { "epoch": 0.1644631017301581, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 2.0068, "step": 524 }, { "epoch": 0.16477696261132252, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.4914, "step": 525 }, { "epoch": 0.16509082349248697, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.7713, "step": 526 }, { "epoch": 0.16540468437365138, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.949, "step": 527 }, { "epoch": 0.1657185452548158, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.9683, "step": 528 }, { "epoch": 0.16603240613598022, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.2225, "step": 529 }, { "epoch": 0.16634626701714464, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 2.2496, "step": 530 }, { "epoch": 0.16666012789830909, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.4913, "step": 531 }, { "epoch": 0.1669739887794735, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.0728, "step": 532 }, { "epoch": 0.16728784966063792, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.7206, "step": 533 }, { "epoch": 0.16760171054180234, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.1716, "step": 534 }, { "epoch": 0.16791557142296676, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.1605, "step": 535 }, { "epoch": 0.1682294323041312, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 2.0504, "step": 536 }, { "epoch": 0.16854329318529562, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.9017, "step": 537 }, { "epoch": 0.16885715406646004, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.2211, "step": 538 }, { "epoch": 0.16917101494762446, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.132, "step": 539 }, { "epoch": 0.16948487582878888, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.0407, "step": 540 }, { "epoch": 0.16979873670995332, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.3361, "step": 541 }, { "epoch": 0.17011259759111774, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.0002, "step": 542 }, { "epoch": 0.17042645847228216, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 2.2588, "step": 543 }, { "epoch": 0.17074031935344658, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.4182, "step": 544 }, { "epoch": 0.171054180234611, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 2.4358, "step": 545 }, { "epoch": 0.17136804111577544, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.3013, "step": 546 }, { "epoch": 0.17168190199693986, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.2304, "step": 547 }, { "epoch": 0.17199576287810428, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.9786, "step": 548 }, { "epoch": 0.1723096237592687, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 2.5713, "step": 549 }, { "epoch": 0.17262348464043312, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.4127, "step": 550 }, { "epoch": 0.17293734552159756, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4719, "step": 551 }, { "epoch": 0.17325120640276198, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4504, "step": 552 }, { "epoch": 0.1735650672839264, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4127, "step": 553 }, { "epoch": 0.17387892816509082, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5099, "step": 554 }, { "epoch": 0.17419278904625524, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6744, "step": 555 }, { "epoch": 0.17450664992741968, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.4517, "step": 556 }, { "epoch": 0.1748205108085841, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.4458, "step": 557 }, { "epoch": 0.17513437168974852, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.3115, "step": 558 }, { "epoch": 0.17544823257091294, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4375, "step": 559 }, { "epoch": 0.17576209345207736, "grad_norm": 0.10595703125, "learning_rate": 0.0002, "loss": 1.3176, "step": 560 }, { "epoch": 0.1760759543332418, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.2423, "step": 561 }, { "epoch": 0.1760759543332418, "eval_loss": 1.8500475883483887, "eval_runtime": 123.2087, "eval_samples_per_second": 8.116, "eval_steps_per_second": 8.116, "step": 561 }, { "epoch": 0.1760759543332418, "mmlu_eval_accuracy": 0.42976112199050265, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.36585365853658536, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.0, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.6190476190476191, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.65, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.7391304347826086, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.84, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.6046511627906976, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.3352941176470588, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.5555555555555556, "mmlu_eval_accuracy_world_religions": 0.7368421052631579, "mmlu_loss": 1.3155592114687122, "step": 561 }, { "epoch": 0.17638981521440622, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.5559, "step": 562 }, { "epoch": 0.17670367609557064, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5174, "step": 563 }, { "epoch": 0.17701753697673506, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4279, "step": 564 }, { "epoch": 0.17733139785789948, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5815, "step": 565 }, { "epoch": 0.17764525873906392, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.6468, "step": 566 }, { "epoch": 0.17795911962022834, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.6738, "step": 567 }, { "epoch": 0.17827298050139276, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.4164, "step": 568 }, { "epoch": 0.17858684138255718, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4132, "step": 569 }, { "epoch": 0.1789007022637216, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.5096, "step": 570 }, { "epoch": 0.17921456314488604, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.762, "step": 571 }, { "epoch": 0.17952842402605046, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.6598, "step": 572 }, { "epoch": 0.17984228490721488, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.4627, "step": 573 }, { "epoch": 0.1801561457883793, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.8129, "step": 574 }, { "epoch": 0.18047000666954371, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.5752, "step": 575 }, { "epoch": 0.18078386755070816, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.8668, "step": 576 }, { "epoch": 0.18109772843187258, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.8953, "step": 577 }, { "epoch": 0.181411589313037, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.1257, "step": 578 }, { "epoch": 0.18172545019420142, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.8638, "step": 579 }, { "epoch": 0.18203931107536583, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.9372, "step": 580 }, { "epoch": 0.18235317195653028, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.8314, "step": 581 }, { "epoch": 0.1826670328376947, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.9337, "step": 582 }, { "epoch": 0.18298089371885912, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.2673, "step": 583 }, { "epoch": 0.18329475460002354, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.1699, "step": 584 }, { "epoch": 0.18360861548118795, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.3086, "step": 585 }, { "epoch": 0.1839224763623524, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.2748, "step": 586 }, { "epoch": 0.18423633724351682, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.226, "step": 587 }, { "epoch": 0.18455019812468124, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.8794, "step": 588 }, { "epoch": 0.18486405900584565, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.7864, "step": 589 }, { "epoch": 0.18517791988701007, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.5467, "step": 590 }, { "epoch": 0.18549178076817452, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.1094, "step": 591 }, { "epoch": 0.18580564164933894, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.2149, "step": 592 }, { "epoch": 0.18611950253050336, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.5329, "step": 593 }, { "epoch": 0.18643336341166777, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.8252, "step": 594 }, { "epoch": 0.1867472242928322, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.2788, "step": 595 }, { "epoch": 0.18706108517399664, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.0439, "step": 596 }, { "epoch": 0.18737494605516106, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.8777, "step": 597 }, { "epoch": 0.18768880693632548, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.5919, "step": 598 }, { "epoch": 0.1880026678174899, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 2.3098, "step": 599 }, { "epoch": 0.1883165286986543, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 2.7208, "step": 600 }, { "epoch": 0.18863038957981876, "grad_norm": 0.1162109375, "learning_rate": 0.0002, "loss": 1.5596, "step": 601 }, { "epoch": 0.18894425046098318, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4349, "step": 602 }, { "epoch": 0.1892581113421476, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.5032, "step": 603 }, { "epoch": 0.189571972223312, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4491, "step": 604 }, { "epoch": 0.18988583310447643, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.4891, "step": 605 }, { "epoch": 0.19019969398564088, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.35, "step": 606 }, { "epoch": 0.1905135548668053, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5648, "step": 607 }, { "epoch": 0.19082741574796971, "grad_norm": 0.1103515625, "learning_rate": 0.0002, "loss": 1.3777, "step": 608 }, { "epoch": 0.19114127662913413, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3868, "step": 609 }, { "epoch": 0.19145513751029855, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.4256, "step": 610 }, { "epoch": 0.191768998391463, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4123, "step": 611 }, { "epoch": 0.19208285927262742, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5571, "step": 612 }, { "epoch": 0.19239672015379183, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6238, "step": 613 }, { "epoch": 0.19271058103495625, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.5166, "step": 614 }, { "epoch": 0.19302444191612067, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4666, "step": 615 }, { "epoch": 0.19333830279728512, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5169, "step": 616 }, { "epoch": 0.19365216367844953, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3763, "step": 617 }, { "epoch": 0.19396602455961395, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.3067, "step": 618 }, { "epoch": 0.19427988544077837, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.6093, "step": 619 }, { "epoch": 0.1945937463219428, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.3269, "step": 620 }, { "epoch": 0.19490760720310724, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.4157, "step": 621 }, { "epoch": 0.19522146808427165, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4526, "step": 622 }, { "epoch": 0.19553532896543607, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.6507, "step": 623 }, { "epoch": 0.1958491898466005, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.7315, "step": 624 }, { "epoch": 0.1961630507277649, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.7668, "step": 625 }, { "epoch": 0.19647691160892936, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.7066, "step": 626 }, { "epoch": 0.19679077249009377, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.5872, "step": 627 }, { "epoch": 0.1971046333712582, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.4538, "step": 628 }, { "epoch": 0.1974184942524226, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.8707, "step": 629 }, { "epoch": 0.19773235513358703, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 2.0015, "step": 630 }, { "epoch": 0.19804621601475147, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.126, "step": 631 }, { "epoch": 0.1983600768959159, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.274, "step": 632 }, { "epoch": 0.1986739377770803, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.1807, "step": 633 }, { "epoch": 0.19898779865824473, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.1142, "step": 634 }, { "epoch": 0.19930165953940915, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.6402, "step": 635 }, { "epoch": 0.1996155204205736, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 2.4704, "step": 636 }, { "epoch": 0.199929381301738, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.5016, "step": 637 }, { "epoch": 0.20024324218290243, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 2.6837, "step": 638 }, { "epoch": 0.20055710306406685, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.4928, "step": 639 }, { "epoch": 0.20087096394523127, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.2994, "step": 640 }, { "epoch": 0.20118482482639571, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.6428, "step": 641 }, { "epoch": 0.20149868570756013, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.0929, "step": 642 }, { "epoch": 0.20181254658872455, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.5125, "step": 643 }, { "epoch": 0.20212640746988897, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.3, "step": 644 }, { "epoch": 0.2024402683510534, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.3705, "step": 645 }, { "epoch": 0.2027541292322178, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.7512, "step": 646 }, { "epoch": 0.20306799011338225, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.19, "step": 647 }, { "epoch": 0.20338185099454667, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.2673, "step": 648 }, { "epoch": 0.2036957118757111, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.8386, "step": 649 }, { "epoch": 0.2040095727568755, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 3.1839, "step": 650 }, { "epoch": 0.20432343363803993, "grad_norm": 0.08642578125, "learning_rate": 0.0002, "loss": 1.381, "step": 651 }, { "epoch": 0.20463729451920437, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.2011, "step": 652 }, { "epoch": 0.2049511554003688, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.479, "step": 653 }, { "epoch": 0.2052650162815332, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4244, "step": 654 }, { "epoch": 0.20557887716269763, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5518, "step": 655 }, { "epoch": 0.20589273804386204, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4713, "step": 656 }, { "epoch": 0.2062065989250265, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.537, "step": 657 }, { "epoch": 0.2065204598061909, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.376, "step": 658 }, { "epoch": 0.20683432068735533, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.5974, "step": 659 }, { "epoch": 0.20714818156851975, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5159, "step": 660 }, { "epoch": 0.20746204244968416, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.4994, "step": 661 }, { "epoch": 0.2077759033308486, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.4817, "step": 662 }, { "epoch": 0.20808976421201303, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4856, "step": 663 }, { "epoch": 0.20840362509317745, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5026, "step": 664 }, { "epoch": 0.20871748597434187, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.4981, "step": 665 }, { "epoch": 0.20903134685550628, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.3784, "step": 666 }, { "epoch": 0.20934520773667073, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.2459, "step": 667 }, { "epoch": 0.20965906861783515, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5568, "step": 668 }, { "epoch": 0.20997292949899957, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.6353, "step": 669 }, { "epoch": 0.21028679038016398, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5004, "step": 670 }, { "epoch": 0.2106006512613284, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.0955, "step": 671 }, { "epoch": 0.21091451214249285, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.6436, "step": 672 }, { "epoch": 0.21122837302365727, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.5068, "step": 673 }, { "epoch": 0.21154223390482169, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.7045, "step": 674 }, { "epoch": 0.2118560947859861, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.6256, "step": 675 }, { "epoch": 0.21216995566715052, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.9182, "step": 676 }, { "epoch": 0.21248381654831497, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.1139, "step": 677 }, { "epoch": 0.2127976774294794, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.9642, "step": 678 }, { "epoch": 0.2131115383106438, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.7713, "step": 679 }, { "epoch": 0.21342539919180822, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.1779, "step": 680 }, { "epoch": 0.21373926007297264, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.6468, "step": 681 }, { "epoch": 0.2140531209541371, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.0927, "step": 682 }, { "epoch": 0.2143669818353015, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1903, "step": 683 }, { "epoch": 0.21468084271646592, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.4789, "step": 684 }, { "epoch": 0.21499470359763034, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 2.1381, "step": 685 }, { "epoch": 0.21530856447879476, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 2.0279, "step": 686 }, { "epoch": 0.2156224253599592, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.5657, "step": 687 }, { "epoch": 0.21593628624112363, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 2.4605, "step": 688 }, { "epoch": 0.21625014712228804, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.2759, "step": 689 }, { "epoch": 0.21656400800345246, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.2051, "step": 690 }, { "epoch": 0.21687786888461688, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.3526, "step": 691 }, { "epoch": 0.21719172976578133, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.3279, "step": 692 }, { "epoch": 0.21750559064694575, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.2478, "step": 693 }, { "epoch": 0.21781945152811016, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 2.5623, "step": 694 }, { "epoch": 0.21813331240927458, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.5045, "step": 695 }, { "epoch": 0.218447173290439, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.1917, "step": 696 }, { "epoch": 0.21876103417160345, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 2.473, "step": 697 }, { "epoch": 0.21907489505276787, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.4232, "step": 698 }, { "epoch": 0.21938875593393228, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.1088, "step": 699 }, { "epoch": 0.2197026168150967, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 3.1141, "step": 700 }, { "epoch": 0.22001647769626112, "grad_norm": 0.09521484375, "learning_rate": 0.0002, "loss": 1.4641, "step": 701 }, { "epoch": 0.22033033857742557, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5301, "step": 702 }, { "epoch": 0.22064419945858998, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.4995, "step": 703 }, { "epoch": 0.2209580603397544, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5175, "step": 704 }, { "epoch": 0.22127192122091882, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.5367, "step": 705 }, { "epoch": 0.22158578210208324, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4447, "step": 706 }, { "epoch": 0.22189964298324769, "grad_norm": 0.1142578125, "learning_rate": 0.0002, "loss": 1.3938, "step": 707 }, { "epoch": 0.2222135038644121, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4849, "step": 708 }, { "epoch": 0.22252736474557652, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4433, "step": 709 }, { "epoch": 0.22284122562674094, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.455, "step": 710 }, { "epoch": 0.22315508650790536, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.3936, "step": 711 }, { "epoch": 0.2234689473890698, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.6169, "step": 712 }, { "epoch": 0.22378280827023422, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.526, "step": 713 }, { "epoch": 0.22409666915139864, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.4675, "step": 714 }, { "epoch": 0.22441053003256306, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4285, "step": 715 }, { "epoch": 0.22472439091372748, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.3881, "step": 716 }, { "epoch": 0.22503825179489192, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.4186, "step": 717 }, { "epoch": 0.22535211267605634, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.4384, "step": 718 }, { "epoch": 0.22566597355722076, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5125, "step": 719 }, { "epoch": 0.22597983443838518, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4096, "step": 720 }, { "epoch": 0.2262936953195496, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5214, "step": 721 }, { "epoch": 0.22660755620071404, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.5779, "step": 722 }, { "epoch": 0.22692141708187846, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.3637, "step": 723 }, { "epoch": 0.22723527796304288, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4644, "step": 724 }, { "epoch": 0.2275491388442073, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.8246, "step": 725 }, { "epoch": 0.22786299972537172, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.8298, "step": 726 }, { "epoch": 0.22817686060653616, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 2.0097, "step": 727 }, { "epoch": 0.22849072148770058, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.8344, "step": 728 }, { "epoch": 0.228804582368865, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.8355, "step": 729 }, { "epoch": 0.22911844325002942, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.2533, "step": 730 }, { "epoch": 0.22943230413119384, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.7191, "step": 731 }, { "epoch": 0.22974616501235828, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 2.0636, "step": 732 }, { "epoch": 0.2300600258935227, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.1691, "step": 733 }, { "epoch": 0.23037388677468712, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.4264, "step": 734 }, { "epoch": 0.23068774765585154, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.3442, "step": 735 }, { "epoch": 0.23100160853701596, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.2909, "step": 736 }, { "epoch": 0.2313154694181804, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.2776, "step": 737 }, { "epoch": 0.23162933029934482, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.1652, "step": 738 }, { "epoch": 0.23194319118050924, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.3553, "step": 739 }, { "epoch": 0.23225705206167366, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.2077, "step": 740 }, { "epoch": 0.23257091294283808, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.9861, "step": 741 }, { "epoch": 0.23288477382400252, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.9471, "step": 742 }, { "epoch": 0.23319863470516694, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.9327, "step": 743 }, { "epoch": 0.23351249558633136, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.4741, "step": 744 }, { "epoch": 0.23382635646749578, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.1698, "step": 745 }, { "epoch": 0.2341402173486602, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.3602, "step": 746 }, { "epoch": 0.23445407822982464, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.7317, "step": 747 }, { "epoch": 0.23476793911098906, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.8612, "step": 748 }, { "epoch": 0.23476793911098906, "eval_loss": 1.8301838636398315, "eval_runtime": 123.7044, "eval_samples_per_second": 8.084, "eval_steps_per_second": 8.084, "step": 748 }, { "epoch": 0.23476793911098906, "mmlu_eval_accuracy": 0.43586306684768467, "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.5625, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.3902439024390244, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907, "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6166666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.6363636363636364, "mmlu_eval_accuracy_high_school_world_history": 0.5769230769230769, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5348837209302325, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.25, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.41935483870967744, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.6818181818181818, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.5, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 0.9066392678516819, "step": 748 }, { "epoch": 0.23508179999215348, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.4457, "step": 749 }, { "epoch": 0.2353956608733179, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 3.3922, "step": 750 }, { "epoch": 0.23570952175448232, "grad_norm": 0.1005859375, "learning_rate": 0.0002, "loss": 1.3832, "step": 751 }, { "epoch": 0.23602338263564676, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.4088, "step": 752 }, { "epoch": 0.23633724351681118, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4099, "step": 753 }, { "epoch": 0.2366511043979756, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.494, "step": 754 }, { "epoch": 0.23696496527914002, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.465, "step": 755 }, { "epoch": 0.23727882616030443, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5089, "step": 756 }, { "epoch": 0.23759268704146888, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5011, "step": 757 }, { "epoch": 0.2379065479226333, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.3608, "step": 758 }, { "epoch": 0.23822040880379772, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.392, "step": 759 }, { "epoch": 0.23853426968496214, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.5432, "step": 760 }, { "epoch": 0.23884813056612655, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.594, "step": 761 }, { "epoch": 0.239161991447291, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.57, "step": 762 }, { "epoch": 0.23947585232845542, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.407, "step": 763 }, { "epoch": 0.23978971320961984, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.3097, "step": 764 }, { "epoch": 0.24010357409078426, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5793, "step": 765 }, { "epoch": 0.24041743497194867, "grad_norm": 0.11328125, "learning_rate": 0.0002, "loss": 1.2891, "step": 766 }, { "epoch": 0.24073129585311312, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3422, "step": 767 }, { "epoch": 0.24104515673427754, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.614, "step": 768 }, { "epoch": 0.24135901761544196, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.4728, "step": 769 }, { "epoch": 0.24167287849660637, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.45, "step": 770 }, { "epoch": 0.2419867393777708, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.8436, "step": 771 }, { "epoch": 0.24230060025893524, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.9457, "step": 772 }, { "epoch": 0.24261446114009966, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.6809, "step": 773 }, { "epoch": 0.24292832202126408, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3838, "step": 774 }, { "epoch": 0.2432421829024285, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.446, "step": 775 }, { "epoch": 0.2435560437835929, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.7502, "step": 776 }, { "epoch": 0.24386990466475736, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.0686, "step": 777 }, { "epoch": 0.24418376554592178, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.6674, "step": 778 }, { "epoch": 0.2444976264270862, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 2.1236, "step": 779 }, { "epoch": 0.2448114873082506, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.2482, "step": 780 }, { "epoch": 0.24512534818941503, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.8243, "step": 781 }, { "epoch": 0.24543920907057948, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 2.0005, "step": 782 }, { "epoch": 0.2457530699517439, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.8594, "step": 783 }, { "epoch": 0.24606693083290831, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.9771, "step": 784 }, { "epoch": 0.24638079171407273, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.1506, "step": 785 }, { "epoch": 0.24669465259523715, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.0608, "step": 786 }, { "epoch": 0.2470085134764016, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.2639, "step": 787 }, { "epoch": 0.24732237435756602, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.4051, "step": 788 }, { "epoch": 0.24763623523873043, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.3805, "step": 789 }, { "epoch": 0.24795009611989485, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.2338, "step": 790 }, { "epoch": 0.24826395700105927, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.2317, "step": 791 }, { "epoch": 0.24857781788222372, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.3565, "step": 792 }, { "epoch": 0.24889167876338814, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 3.0328, "step": 793 }, { "epoch": 0.24920553964455255, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.253, "step": 794 }, { "epoch": 0.24951940052571697, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.3392, "step": 795 }, { "epoch": 0.2498332614068814, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.2872, "step": 796 }, { "epoch": 0.2501471222880458, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.3179, "step": 797 }, { "epoch": 0.2504609831692102, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.6253, "step": 798 }, { "epoch": 0.25077484405037465, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.6689, "step": 799 }, { "epoch": 0.2510887049315391, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.76, "step": 800 }, { "epoch": 0.25140256581270354, "grad_norm": 0.10595703125, "learning_rate": 0.0002, "loss": 1.4852, "step": 801 }, { "epoch": 0.25171642669386796, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4893, "step": 802 }, { "epoch": 0.2520302875750324, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.4828, "step": 803 }, { "epoch": 0.2523441484561968, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4697, "step": 804 }, { "epoch": 0.2526580093373612, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.4742, "step": 805 }, { "epoch": 0.25297187021852563, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4976, "step": 806 }, { "epoch": 0.25328573109969005, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.4181, "step": 807 }, { "epoch": 0.25359959198085447, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.521, "step": 808 }, { "epoch": 0.2539134528620189, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5012, "step": 809 }, { "epoch": 0.25422731374318336, "grad_norm": 0.10595703125, "learning_rate": 0.0002, "loss": 1.406, "step": 810 }, { "epoch": 0.2545411746243478, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.463, "step": 811 }, { "epoch": 0.2548550355055122, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.5657, "step": 812 }, { "epoch": 0.2551688963866766, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4766, "step": 813 }, { "epoch": 0.25548275726784103, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.5833, "step": 814 }, { "epoch": 0.25579661814900545, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3477, "step": 815 }, { "epoch": 0.25611047903016987, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4316, "step": 816 }, { "epoch": 0.2564243399113343, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.442, "step": 817 }, { "epoch": 0.2567382007924987, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.3093, "step": 818 }, { "epoch": 0.2570520616736631, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5933, "step": 819 }, { "epoch": 0.2573659225548276, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.439, "step": 820 }, { "epoch": 0.257679783435992, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4433, "step": 821 }, { "epoch": 0.25799364431715643, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.5624, "step": 822 }, { "epoch": 0.25830750519832085, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4514, "step": 823 }, { "epoch": 0.25862136607948527, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4493, "step": 824 }, { "epoch": 0.2589352269606497, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.5928, "step": 825 }, { "epoch": 0.2592490878418141, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.5877, "step": 826 }, { "epoch": 0.2595629487229785, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.7294, "step": 827 }, { "epoch": 0.25987680960414294, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.7468, "step": 828 }, { "epoch": 0.26019067048530736, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 2.0077, "step": 829 }, { "epoch": 0.26050453136647184, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 2.0081, "step": 830 }, { "epoch": 0.26081839224763625, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.8417, "step": 831 }, { "epoch": 0.2611322531288007, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.9381, "step": 832 }, { "epoch": 0.2614461140099651, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.2372, "step": 833 }, { "epoch": 0.2617599748911295, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.2899, "step": 834 }, { "epoch": 0.26207383577229393, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.0124, "step": 835 }, { "epoch": 0.26238769665345835, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.9612, "step": 836 }, { "epoch": 0.26270155753462276, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.2552, "step": 837 }, { "epoch": 0.2630154184157872, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.4561, "step": 838 }, { "epoch": 0.2633292792969516, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.6507, "step": 839 }, { "epoch": 0.2636431401781161, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.3272, "step": 840 }, { "epoch": 0.2639570010592805, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.8104, "step": 841 }, { "epoch": 0.2642708619404449, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 2.3128, "step": 842 }, { "epoch": 0.26458472282160933, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.4287, "step": 843 }, { "epoch": 0.26489858370277375, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.56, "step": 844 }, { "epoch": 0.26521244458393817, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 2.6142, "step": 845 }, { "epoch": 0.2655263054651026, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.8461, "step": 846 }, { "epoch": 0.265840166346267, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.9298, "step": 847 }, { "epoch": 0.2661540272274314, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.9589, "step": 848 }, { "epoch": 0.26646788810859584, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.3589, "step": 849 }, { "epoch": 0.2667817489897603, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.5415, "step": 850 }, { "epoch": 0.26709560987092473, "grad_norm": 0.09765625, "learning_rate": 0.0002, "loss": 1.3115, "step": 851 }, { "epoch": 0.26740947075208915, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.4415, "step": 852 }, { "epoch": 0.26772333163325357, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3668, "step": 853 }, { "epoch": 0.268037192514418, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.5009, "step": 854 }, { "epoch": 0.2683510533955824, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6886, "step": 855 }, { "epoch": 0.2686649142767468, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.4729, "step": 856 }, { "epoch": 0.26897877515791124, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.4901, "step": 857 }, { "epoch": 0.26929263603907566, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.539, "step": 858 }, { "epoch": 0.2696064969202401, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4547, "step": 859 }, { "epoch": 0.26992035780140455, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.4896, "step": 860 }, { "epoch": 0.27023421868256897, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.3954, "step": 861 }, { "epoch": 0.2705480795637334, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.5198, "step": 862 }, { "epoch": 0.2708619404448978, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5698, "step": 863 }, { "epoch": 0.2711758013260622, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4155, "step": 864 }, { "epoch": 0.27148966220722665, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.5329, "step": 865 }, { "epoch": 0.27180352308839106, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.4575, "step": 866 }, { "epoch": 0.2721173839695555, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4322, "step": 867 }, { "epoch": 0.2724312448507199, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3853, "step": 868 }, { "epoch": 0.2727451057318843, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.7375, "step": 869 }, { "epoch": 0.2730589666130488, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.5402, "step": 870 }, { "epoch": 0.2733728274942132, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6293, "step": 871 }, { "epoch": 0.27368668837537763, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5219, "step": 872 }, { "epoch": 0.27400054925654205, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.6687, "step": 873 }, { "epoch": 0.27431441013770647, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.7818, "step": 874 }, { "epoch": 0.2746282710188709, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.887, "step": 875 }, { "epoch": 0.2749421319000353, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.7887, "step": 876 }, { "epoch": 0.2752559927811997, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.6122, "step": 877 }, { "epoch": 0.27556985366236414, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.6951, "step": 878 }, { "epoch": 0.27588371454352856, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 2.0917, "step": 879 }, { "epoch": 0.27619757542469303, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.3369, "step": 880 }, { "epoch": 0.27651143630585745, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 2.1028, "step": 881 }, { "epoch": 0.27682529718702187, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.8433, "step": 882 }, { "epoch": 0.2771391580681863, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 2.2209, "step": 883 }, { "epoch": 0.2774530189493507, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.4793, "step": 884 }, { "epoch": 0.2777668798305151, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 2.1978, "step": 885 }, { "epoch": 0.27808074071167954, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.6263, "step": 886 }, { "epoch": 0.27839460159284396, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.1229, "step": 887 }, { "epoch": 0.2787084624740084, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.3003, "step": 888 }, { "epoch": 0.2790223233551728, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.4488, "step": 889 }, { "epoch": 0.27933618423633727, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.2592, "step": 890 }, { "epoch": 0.2796500451175017, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.0843, "step": 891 }, { "epoch": 0.2799639059986661, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.2622, "step": 892 }, { "epoch": 0.2802777668798305, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.2364, "step": 893 }, { "epoch": 0.28059162776099494, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.0579, "step": 894 }, { "epoch": 0.28090548864215936, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.7902, "step": 895 }, { "epoch": 0.2812193495233238, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.2274, "step": 896 }, { "epoch": 0.2815332104044882, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.9381, "step": 897 }, { "epoch": 0.2818470712856526, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.208, "step": 898 }, { "epoch": 0.28216093216681704, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 2.189, "step": 899 }, { "epoch": 0.2824747930479815, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.8205, "step": 900 }, { "epoch": 0.28278865392914593, "grad_norm": 0.095703125, "learning_rate": 0.0002, "loss": 1.3313, "step": 901 }, { "epoch": 0.28310251481031035, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.4316, "step": 902 }, { "epoch": 0.28341637569147476, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6624, "step": 903 }, { "epoch": 0.2837302365726392, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4856, "step": 904 }, { "epoch": 0.2840440974538036, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.3935, "step": 905 }, { "epoch": 0.284357958334968, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5396, "step": 906 }, { "epoch": 0.28467181921613244, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.51, "step": 907 }, { "epoch": 0.28498568009729686, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.4975, "step": 908 }, { "epoch": 0.2852995409784613, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6255, "step": 909 }, { "epoch": 0.28561340185962575, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.4139, "step": 910 }, { "epoch": 0.28592726274079017, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.4564, "step": 911 }, { "epoch": 0.2862411236219546, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.5092, "step": 912 }, { "epoch": 0.286554984503119, "grad_norm": 0.11328125, "learning_rate": 0.0002, "loss": 1.3182, "step": 913 }, { "epoch": 0.2868688453842834, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.3046, "step": 914 }, { "epoch": 0.28718270626544784, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.34, "step": 915 }, { "epoch": 0.28749656714661226, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4487, "step": 916 }, { "epoch": 0.2878104280277767, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.4902, "step": 917 }, { "epoch": 0.2881242889089411, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4275, "step": 918 }, { "epoch": 0.2884381497901055, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5213, "step": 919 }, { "epoch": 0.28875201067126993, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.1849, "step": 920 }, { "epoch": 0.2890658715524344, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.6107, "step": 921 }, { "epoch": 0.2893797324335988, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.7007, "step": 922 }, { "epoch": 0.28969359331476324, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.6828, "step": 923 }, { "epoch": 0.29000745419592766, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.8089, "step": 924 }, { "epoch": 0.2903213150770921, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.8047, "step": 925 }, { "epoch": 0.2906351759582565, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.6016, "step": 926 }, { "epoch": 0.2909490368394209, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 2.0681, "step": 927 }, { "epoch": 0.29126289772058533, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.8313, "step": 928 }, { "epoch": 0.29157675860174975, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.4114, "step": 929 }, { "epoch": 0.29189061948291417, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.2211, "step": 930 }, { "epoch": 0.29220448036407864, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.9012, "step": 931 }, { "epoch": 0.29251834124524306, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.1886, "step": 932 }, { "epoch": 0.2928322021264075, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.1408, "step": 933 }, { "epoch": 0.2931460630075719, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.3237, "step": 934 }, { "epoch": 0.2934599238887363, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.4881, "step": 935 }, { "epoch": 0.2934599238887363, "eval_loss": 1.8006056547164917, "eval_runtime": 149.6622, "eval_samples_per_second": 6.682, "eval_steps_per_second": 6.682, "step": 935 }, { "epoch": 0.2934599238887363, "mmlu_eval_accuracy": 0.42205537825180356, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6166666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.45161290322580644, "mmlu_eval_accuracy_professional_law": 0.3235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.0942840913905574, "step": 935 }, { "epoch": 0.29377378476990074, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.1141, "step": 936 }, { "epoch": 0.29408764565106515, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.6761, "step": 937 }, { "epoch": 0.2944015065322296, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.0653, "step": 938 }, { "epoch": 0.294715367413394, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.1641, "step": 939 }, { "epoch": 0.2950292282945584, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.2414, "step": 940 }, { "epoch": 0.2953430891757229, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.3736, "step": 941 }, { "epoch": 0.2956569500568873, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.3303, "step": 942 }, { "epoch": 0.2959708109380517, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.3767, "step": 943 }, { "epoch": 0.29628467181921614, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 2.5385, "step": 944 }, { "epoch": 0.29659853270038056, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.4962, "step": 945 }, { "epoch": 0.296912393581545, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.0142, "step": 946 }, { "epoch": 0.2972262544627094, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.0577, "step": 947 }, { "epoch": 0.2975401153438738, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.6295, "step": 948 }, { "epoch": 0.29785397622503823, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.9394, "step": 949 }, { "epoch": 0.29816783710620265, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.9883, "step": 950 }, { "epoch": 0.2984816979873671, "grad_norm": 0.095703125, "learning_rate": 0.0002, "loss": 1.4738, "step": 951 }, { "epoch": 0.29879555886853154, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.43, "step": 952 }, { "epoch": 0.29910941974969596, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.4107, "step": 953 }, { "epoch": 0.2994232806308604, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.3789, "step": 954 }, { "epoch": 0.2997371415120248, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.4341, "step": 955 }, { "epoch": 0.3000510023931892, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.5085, "step": 956 }, { "epoch": 0.30036486327435363, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.3617, "step": 957 }, { "epoch": 0.30067872415551805, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.5131, "step": 958 }, { "epoch": 0.30099258503668247, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6087, "step": 959 }, { "epoch": 0.3013064459178469, "grad_norm": 0.1103515625, "learning_rate": 0.0002, "loss": 1.3892, "step": 960 }, { "epoch": 0.30162030679901136, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.372, "step": 961 }, { "epoch": 0.3019341676801758, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.3744, "step": 962 }, { "epoch": 0.3022480285613402, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.4449, "step": 963 }, { "epoch": 0.3025618894425046, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4368, "step": 964 }, { "epoch": 0.30287575032366904, "grad_norm": 0.1103515625, "learning_rate": 0.0002, "loss": 1.2957, "step": 965 }, { "epoch": 0.30318961120483345, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.467, "step": 966 }, { "epoch": 0.30350347208599787, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.4731, "step": 967 }, { "epoch": 0.3038173329671623, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.4283, "step": 968 }, { "epoch": 0.3041311938483267, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.664, "step": 969 }, { "epoch": 0.3044450547294911, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5262, "step": 970 }, { "epoch": 0.3047589156106556, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.361, "step": 971 }, { "epoch": 0.30507277649182, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.559, "step": 972 }, { "epoch": 0.30538663737298444, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.6235, "step": 973 }, { "epoch": 0.30570049825414886, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.7893, "step": 974 }, { "epoch": 0.3060143591353133, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.759, "step": 975 }, { "epoch": 0.3063282200164777, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.7774, "step": 976 }, { "epoch": 0.3066420808976421, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7255, "step": 977 }, { "epoch": 0.30695594177880653, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.8652, "step": 978 }, { "epoch": 0.30726980265997095, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.5051, "step": 979 }, { "epoch": 0.30758366354113537, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 2.0593, "step": 980 }, { "epoch": 0.30789752442229984, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.689, "step": 981 }, { "epoch": 0.30821138530346426, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.5818, "step": 982 }, { "epoch": 0.3085252461846287, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.8198, "step": 983 }, { "epoch": 0.3088391070657931, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.2934, "step": 984 }, { "epoch": 0.3091529679469575, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.2926, "step": 985 }, { "epoch": 0.30946682882812193, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.5109, "step": 986 }, { "epoch": 0.30978068970928635, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.0915, "step": 987 }, { "epoch": 0.31009455059045077, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 2.1578, "step": 988 }, { "epoch": 0.3104084114716152, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.6873, "step": 989 }, { "epoch": 0.3107222723527796, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.9798, "step": 990 }, { "epoch": 0.3110361332339441, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.9699, "step": 991 }, { "epoch": 0.3113499941151085, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.0929, "step": 992 }, { "epoch": 0.3116638549962729, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.7644, "step": 993 }, { "epoch": 0.31197771587743733, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 2.3392, "step": 994 }, { "epoch": 0.31229157675860175, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.7899, "step": 995 }, { "epoch": 0.31260543763976617, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.5258, "step": 996 }, { "epoch": 0.3129192985209306, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.4348, "step": 997 }, { "epoch": 0.313233159402095, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.0247, "step": 998 }, { "epoch": 0.3135470202832594, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.4701, "step": 999 }, { "epoch": 0.31386088116442384, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.5099, "step": 1000 }, { "epoch": 0.3141747420455883, "grad_norm": 0.09716796875, "learning_rate": 0.0002, "loss": 1.4304, "step": 1001 }, { "epoch": 0.31448860292675274, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.3457, "step": 1002 }, { "epoch": 0.31480246380791715, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.5733, "step": 1003 }, { "epoch": 0.3151163246890816, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.4797, "step": 1004 }, { "epoch": 0.315430185570246, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.3706, "step": 1005 }, { "epoch": 0.3157440464514104, "grad_norm": 0.10546875, "learning_rate": 0.0002, "loss": 1.3173, "step": 1006 }, { "epoch": 0.31605790733257483, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.555, "step": 1007 }, { "epoch": 0.31637176821373925, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.3216, "step": 1008 }, { "epoch": 0.31668562909490366, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.4556, "step": 1009 }, { "epoch": 0.3169994899760681, "grad_norm": 0.1015625, "learning_rate": 0.0002, "loss": 1.197, "step": 1010 }, { "epoch": 0.31731335085723256, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.3905, "step": 1011 }, { "epoch": 0.317627211738397, "grad_norm": 0.11181640625, "learning_rate": 0.0002, "loss": 1.4539, "step": 1012 }, { "epoch": 0.3179410726195614, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3952, "step": 1013 }, { "epoch": 0.3182549335007258, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4926, "step": 1014 }, { "epoch": 0.31856879438189023, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.5527, "step": 1015 }, { "epoch": 0.31888265526305465, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5856, "step": 1016 }, { "epoch": 0.31919651614421907, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5755, "step": 1017 }, { "epoch": 0.3195103770253835, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.2928, "step": 1018 }, { "epoch": 0.3198242379065479, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3928, "step": 1019 }, { "epoch": 0.3201380987877123, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5289, "step": 1020 }, { "epoch": 0.3204519596688768, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5257, "step": 1021 }, { "epoch": 0.3207658205500412, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.662, "step": 1022 }, { "epoch": 0.32107968143120563, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.5929, "step": 1023 }, { "epoch": 0.32139354231237005, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.6586, "step": 1024 }, { "epoch": 0.32170740319353447, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4007, "step": 1025 }, { "epoch": 0.3220212640746989, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.6509, "step": 1026 }, { "epoch": 0.3223351249558633, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.7426, "step": 1027 }, { "epoch": 0.3226489858370277, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.0932, "step": 1028 }, { "epoch": 0.32296284671819214, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.7658, "step": 1029 }, { "epoch": 0.32327670759935656, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.8366, "step": 1030 }, { "epoch": 0.32359056848052103, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.76, "step": 1031 }, { "epoch": 0.32390442936168545, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 2.0031, "step": 1032 }, { "epoch": 0.32421829024284987, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.9684, "step": 1033 }, { "epoch": 0.3245321511240143, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.6309, "step": 1034 }, { "epoch": 0.3248460120051787, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.1617, "step": 1035 }, { "epoch": 0.3251598728863431, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.5695, "step": 1036 }, { "epoch": 0.32547373376750754, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 2.3536, "step": 1037 }, { "epoch": 0.32578759464867196, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 2.6076, "step": 1038 }, { "epoch": 0.3261014555298364, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.6511, "step": 1039 }, { "epoch": 0.3264153164110008, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.3847, "step": 1040 }, { "epoch": 0.3267291772921653, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.8911, "step": 1041 }, { "epoch": 0.3270430381733297, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.8818, "step": 1042 }, { "epoch": 0.3273568990544941, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.3726, "step": 1043 }, { "epoch": 0.32767075993565853, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.7633, "step": 1044 }, { "epoch": 0.32798462081682295, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.7109, "step": 1045 }, { "epoch": 0.32829848169798737, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.6085, "step": 1046 }, { "epoch": 0.3286123425791518, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.2521, "step": 1047 }, { "epoch": 0.3289262034603162, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.1469, "step": 1048 }, { "epoch": 0.3292400643414806, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.7812, "step": 1049 }, { "epoch": 0.32955392522264504, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.6907, "step": 1050 }, { "epoch": 0.3298677861038095, "grad_norm": 0.08056640625, "learning_rate": 0.0002, "loss": 1.3973, "step": 1051 }, { "epoch": 0.33018164698497393, "grad_norm": 0.09521484375, "learning_rate": 0.0002, "loss": 1.2347, "step": 1052 }, { "epoch": 0.33049550786613835, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.38, "step": 1053 }, { "epoch": 0.33080936874730277, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.3663, "step": 1054 }, { "epoch": 0.3311232296284672, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4329, "step": 1055 }, { "epoch": 0.3314370905096316, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.396, "step": 1056 }, { "epoch": 0.331750951390796, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.652, "step": 1057 }, { "epoch": 0.33206481227196044, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.644, "step": 1058 }, { "epoch": 0.33237867315312486, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4038, "step": 1059 }, { "epoch": 0.3326925340342893, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.4993, "step": 1060 }, { "epoch": 0.33300639491545375, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.3896, "step": 1061 }, { "epoch": 0.33332025579661817, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.5448, "step": 1062 }, { "epoch": 0.3336341166777826, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.409, "step": 1063 }, { "epoch": 0.333947977558947, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.3842, "step": 1064 }, { "epoch": 0.3342618384401114, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3284, "step": 1065 }, { "epoch": 0.33457569932127584, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.6964, "step": 1066 }, { "epoch": 0.33488956020244026, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.6085, "step": 1067 }, { "epoch": 0.3352034210836047, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5205, "step": 1068 }, { "epoch": 0.3355172819647691, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.3477, "step": 1069 }, { "epoch": 0.3358311428459335, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.4518, "step": 1070 }, { "epoch": 0.336145003727098, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.7597, "step": 1071 }, { "epoch": 0.3364588646082624, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.4113, "step": 1072 }, { "epoch": 0.3367727254894268, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.4123, "step": 1073 }, { "epoch": 0.33708658637059125, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.471, "step": 1074 }, { "epoch": 0.33740044725175566, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.4666, "step": 1075 }, { "epoch": 0.3377143081329201, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.956, "step": 1076 }, { "epoch": 0.3380281690140845, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.6647, "step": 1077 }, { "epoch": 0.3383420298952489, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.8713, "step": 1078 }, { "epoch": 0.33865589077641334, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.8572, "step": 1079 }, { "epoch": 0.33896975165757776, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.9718, "step": 1080 }, { "epoch": 0.33928361253874223, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.8293, "step": 1081 }, { "epoch": 0.33959747341990665, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.9903, "step": 1082 }, { "epoch": 0.33991133430107107, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 2.1682, "step": 1083 }, { "epoch": 0.3402251951822355, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.1994, "step": 1084 }, { "epoch": 0.3405390560633999, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.4478, "step": 1085 }, { "epoch": 0.3408529169445643, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.9303, "step": 1086 }, { "epoch": 0.34116677782572874, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.002, "step": 1087 }, { "epoch": 0.34148063870689316, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.3037, "step": 1088 }, { "epoch": 0.3417944995880576, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.3346, "step": 1089 }, { "epoch": 0.342108360469222, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.318, "step": 1090 }, { "epoch": 0.34242222135038647, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.101, "step": 1091 }, { "epoch": 0.3427360822315509, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.2494, "step": 1092 }, { "epoch": 0.3430499431127153, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.2361, "step": 1093 }, { "epoch": 0.3433638039938797, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.8695, "step": 1094 }, { "epoch": 0.34367766487504414, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.2861, "step": 1095 }, { "epoch": 0.34399152575620856, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.7394, "step": 1096 }, { "epoch": 0.344305386637373, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.768, "step": 1097 }, { "epoch": 0.3446192475185374, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.6265, "step": 1098 }, { "epoch": 0.3449331083997018, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 2.6331, "step": 1099 }, { "epoch": 0.34524696928086623, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 3.8936, "step": 1100 }, { "epoch": 0.34556083016203065, "grad_norm": 0.080078125, "learning_rate": 0.0002, "loss": 1.3829, "step": 1101 }, { "epoch": 0.3458746910431951, "grad_norm": 0.095703125, "learning_rate": 0.0002, "loss": 1.3983, "step": 1102 }, { "epoch": 0.34618855192435954, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.5221, "step": 1103 }, { "epoch": 0.34650241280552396, "grad_norm": 0.10205078125, "learning_rate": 0.0002, "loss": 1.3643, "step": 1104 }, { "epoch": 0.3468162736866884, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.5967, "step": 1105 }, { "epoch": 0.3471301345678528, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.4927, "step": 1106 }, { "epoch": 0.3474439954490172, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4529, "step": 1107 }, { "epoch": 0.34775785633018164, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.2427, "step": 1108 }, { "epoch": 0.34807171721134605, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3519, "step": 1109 }, { "epoch": 0.3483855780925105, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.4845, "step": 1110 }, { "epoch": 0.3486994389736749, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.4421, "step": 1111 }, { "epoch": 0.34901329985483937, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.2866, "step": 1112 }, { "epoch": 0.3493271607360038, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5621, "step": 1113 }, { "epoch": 0.3496410216171682, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.3539, "step": 1114 }, { "epoch": 0.3499548824983326, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4143, "step": 1115 }, { "epoch": 0.35026874337949704, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.4976, "step": 1116 }, { "epoch": 0.35058260426066146, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.3493, "step": 1117 }, { "epoch": 0.3508964651418259, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3277, "step": 1118 }, { "epoch": 0.3512103260229903, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.4038, "step": 1119 }, { "epoch": 0.3515241869041547, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5456, "step": 1120 }, { "epoch": 0.35183804778531913, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4138, "step": 1121 }, { "epoch": 0.3521519086664836, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4485, "step": 1122 }, { "epoch": 0.3521519086664836, "eval_loss": 1.7991268634796143, "eval_runtime": 123.0627, "eval_samples_per_second": 8.126, "eval_steps_per_second": 8.126, "step": 1122 }, { "epoch": 0.3521519086664836, "mmlu_eval_accuracy": 0.4303232791485502, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6166666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.3352941176470588, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.4021799655656733, "step": 1122 }, { "epoch": 0.352465769547648, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.6944, "step": 1123 }, { "epoch": 0.35277963042881244, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.4365, "step": 1124 }, { "epoch": 0.35309349130997686, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.7668, "step": 1125 }, { "epoch": 0.3534073521911413, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.7494, "step": 1126 }, { "epoch": 0.3537212130723057, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.7577, "step": 1127 }, { "epoch": 0.3540350739534701, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.5836, "step": 1128 }, { "epoch": 0.35434893483463453, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.7775, "step": 1129 }, { "epoch": 0.35466279571579895, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.7426, "step": 1130 }, { "epoch": 0.35497665659696337, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.7906, "step": 1131 }, { "epoch": 0.35529051747812784, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.7566, "step": 1132 }, { "epoch": 0.35560437835929226, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.2073, "step": 1133 }, { "epoch": 0.3559182392404567, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.885, "step": 1134 }, { "epoch": 0.3562321001216211, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.2592, "step": 1135 }, { "epoch": 0.3565459610027855, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.0171, "step": 1136 }, { "epoch": 0.35685982188394993, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.4336, "step": 1137 }, { "epoch": 0.35717368276511435, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 2.1057, "step": 1138 }, { "epoch": 0.35748754364627877, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.6518, "step": 1139 }, { "epoch": 0.3578014045274432, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.7937, "step": 1140 }, { "epoch": 0.3581152654086076, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.2845, "step": 1141 }, { "epoch": 0.3584291262897721, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.41, "step": 1142 }, { "epoch": 0.3587429871709365, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.1395, "step": 1143 }, { "epoch": 0.3590568480521009, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.2529, "step": 1144 }, { "epoch": 0.35937070893326534, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.7725, "step": 1145 }, { "epoch": 0.35968456981442976, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.3379, "step": 1146 }, { "epoch": 0.3599984306955942, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.8914, "step": 1147 }, { "epoch": 0.3603122915767586, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 2.4866, "step": 1148 }, { "epoch": 0.360626152457923, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.8759, "step": 1149 }, { "epoch": 0.36094001333908743, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.9355, "step": 1150 }, { "epoch": 0.36125387422025185, "grad_norm": 0.1142578125, "learning_rate": 0.0002, "loss": 1.5055, "step": 1151 }, { "epoch": 0.3615677351014163, "grad_norm": 0.09130859375, "learning_rate": 0.0002, "loss": 1.382, "step": 1152 }, { "epoch": 0.36188159598258074, "grad_norm": 0.107421875, "learning_rate": 0.0002, "loss": 1.3176, "step": 1153 }, { "epoch": 0.36219545686374516, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.4593, "step": 1154 }, { "epoch": 0.3625093177449096, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.385, "step": 1155 }, { "epoch": 0.362823178626074, "grad_norm": 0.11328125, "learning_rate": 0.0002, "loss": 1.3072, "step": 1156 }, { "epoch": 0.3631370395072384, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.2866, "step": 1157 }, { "epoch": 0.36345090038840283, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.3623, "step": 1158 }, { "epoch": 0.36376476126956725, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.3802, "step": 1159 }, { "epoch": 0.36407862215073167, "grad_norm": 0.10791015625, "learning_rate": 0.0002, "loss": 1.4173, "step": 1160 }, { "epoch": 0.3643924830318961, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.4814, "step": 1161 }, { "epoch": 0.36470634391306056, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.4864, "step": 1162 }, { "epoch": 0.365020204794225, "grad_norm": 0.109375, "learning_rate": 0.0002, "loss": 1.4863, "step": 1163 }, { "epoch": 0.3653340656753894, "grad_norm": 0.12060546875, "learning_rate": 0.0002, "loss": 1.4433, "step": 1164 }, { "epoch": 0.3656479265565538, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.4984, "step": 1165 }, { "epoch": 0.36596178743771823, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4259, "step": 1166 }, { "epoch": 0.36627564831888265, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.566, "step": 1167 }, { "epoch": 0.36658950920004707, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.415, "step": 1168 }, { "epoch": 0.3669033700812115, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7164, "step": 1169 }, { "epoch": 0.3672172309623759, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5214, "step": 1170 }, { "epoch": 0.3675310918435403, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6026, "step": 1171 }, { "epoch": 0.3678449527247048, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.5882, "step": 1172 }, { "epoch": 0.3681588136058692, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.6785, "step": 1173 }, { "epoch": 0.36847267448703364, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.4065, "step": 1174 }, { "epoch": 0.36878653536819805, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.7289, "step": 1175 }, { "epoch": 0.3691003962493625, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.0728, "step": 1176 }, { "epoch": 0.3694142571305269, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.5221, "step": 1177 }, { "epoch": 0.3697281180116913, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 2.0874, "step": 1178 }, { "epoch": 0.3700419788928557, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.9565, "step": 1179 }, { "epoch": 0.37035583977402015, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.9323, "step": 1180 }, { "epoch": 0.37066970065518456, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.9522, "step": 1181 }, { "epoch": 0.37098356153634904, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.0937, "step": 1182 }, { "epoch": 0.37129742241751346, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.8839, "step": 1183 }, { "epoch": 0.3716112832986779, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.8556, "step": 1184 }, { "epoch": 0.3719251441798423, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 2.0557, "step": 1185 }, { "epoch": 0.3722390050610067, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.3553, "step": 1186 }, { "epoch": 0.37255286594217113, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.1199, "step": 1187 }, { "epoch": 0.37286672682333555, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.1553, "step": 1188 }, { "epoch": 0.37318058770449997, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.7584, "step": 1189 }, { "epoch": 0.3734944485856644, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.5655, "step": 1190 }, { "epoch": 0.3738083094668288, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 2.7206, "step": 1191 }, { "epoch": 0.3741221703479933, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.2983, "step": 1192 }, { "epoch": 0.3744360312291577, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.5373, "step": 1193 }, { "epoch": 0.3747498921103221, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.1976, "step": 1194 }, { "epoch": 0.37506375299148653, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.7856, "step": 1195 }, { "epoch": 0.37537761387265095, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.1118, "step": 1196 }, { "epoch": 0.37569147475381537, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.2917, "step": 1197 }, { "epoch": 0.3760053356349798, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.2105, "step": 1198 }, { "epoch": 0.3763191965161442, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.2213, "step": 1199 }, { "epoch": 0.3766330573973086, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.4509, "step": 1200 }, { "epoch": 0.37694691827847304, "grad_norm": 0.099609375, "learning_rate": 0.0002, "loss": 1.4907, "step": 1201 }, { "epoch": 0.3772607791596375, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4751, "step": 1202 }, { "epoch": 0.37757464004080193, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.41, "step": 1203 }, { "epoch": 0.37788850092196635, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4708, "step": 1204 }, { "epoch": 0.37820236180313077, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.4557, "step": 1205 }, { "epoch": 0.3785162226842952, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4991, "step": 1206 }, { "epoch": 0.3788300835654596, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4234, "step": 1207 }, { "epoch": 0.379143944446624, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.3855, "step": 1208 }, { "epoch": 0.37945780532778844, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4934, "step": 1209 }, { "epoch": 0.37977166620895286, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4267, "step": 1210 }, { "epoch": 0.3800855270901173, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.443, "step": 1211 }, { "epoch": 0.38039938797128175, "grad_norm": 0.1162109375, "learning_rate": 0.0002, "loss": 1.352, "step": 1212 }, { "epoch": 0.3807132488524462, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3038, "step": 1213 }, { "epoch": 0.3810271097336106, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.2391, "step": 1214 }, { "epoch": 0.381340970614775, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4306, "step": 1215 }, { "epoch": 0.38165483149593943, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.588, "step": 1216 }, { "epoch": 0.38196869237710385, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.3168, "step": 1217 }, { "epoch": 0.38228255325826827, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5673, "step": 1218 }, { "epoch": 0.3825964141394327, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.44, "step": 1219 }, { "epoch": 0.3829102750205971, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.3603, "step": 1220 }, { "epoch": 0.3832241359017615, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5589, "step": 1221 }, { "epoch": 0.383537996782926, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4931, "step": 1222 }, { "epoch": 0.3838518576640904, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.42, "step": 1223 }, { "epoch": 0.38416571854525483, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.6192, "step": 1224 }, { "epoch": 0.38447957942641925, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.7035, "step": 1225 }, { "epoch": 0.38479344030758367, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.5969, "step": 1226 }, { "epoch": 0.3851073011887481, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.8406, "step": 1227 }, { "epoch": 0.3854211620699125, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 2.1648, "step": 1228 }, { "epoch": 0.3857350229510769, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.9683, "step": 1229 }, { "epoch": 0.38604888383224134, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.817, "step": 1230 }, { "epoch": 0.38636274471340576, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.045, "step": 1231 }, { "epoch": 0.38667660559457023, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 2.0243, "step": 1232 }, { "epoch": 0.38699046647573465, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.7933, "step": 1233 }, { "epoch": 0.38730432735689907, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.1071, "step": 1234 }, { "epoch": 0.3876181882380635, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.33, "step": 1235 }, { "epoch": 0.3879320491192279, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.4712, "step": 1236 }, { "epoch": 0.3882459100003923, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.8365, "step": 1237 }, { "epoch": 0.38855977088155674, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.4525, "step": 1238 }, { "epoch": 0.38887363176272116, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.8756, "step": 1239 }, { "epoch": 0.3891874926438856, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.762, "step": 1240 }, { "epoch": 0.38950135352505, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.0963, "step": 1241 }, { "epoch": 0.38981521440621447, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.136, "step": 1242 }, { "epoch": 0.3901290752873789, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 2.3817, "step": 1243 }, { "epoch": 0.3904429361685433, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.2338, "step": 1244 }, { "epoch": 0.3907567970497077, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.9984, "step": 1245 }, { "epoch": 0.39107065793087215, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.2326, "step": 1246 }, { "epoch": 0.39138451881203656, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.0614, "step": 1247 }, { "epoch": 0.391698379693201, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.408, "step": 1248 }, { "epoch": 0.3920122405743654, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.4836, "step": 1249 }, { "epoch": 0.3923261014555298, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.8903, "step": 1250 }, { "epoch": 0.39263996233669424, "grad_norm": 0.08056640625, "learning_rate": 0.0002, "loss": 1.3626, "step": 1251 }, { "epoch": 0.3929538232178587, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.4351, "step": 1252 }, { "epoch": 0.39326768409902313, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.4794, "step": 1253 }, { "epoch": 0.39358154498018755, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.6374, "step": 1254 }, { "epoch": 0.39389540586135197, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4264, "step": 1255 }, { "epoch": 0.3942092667425164, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.3838, "step": 1256 }, { "epoch": 0.3945231276236808, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.4814, "step": 1257 }, { "epoch": 0.3948369885048452, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.4941, "step": 1258 }, { "epoch": 0.39515084938600964, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3982, "step": 1259 }, { "epoch": 0.39546471026717406, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.5024, "step": 1260 }, { "epoch": 0.3957785711483385, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.5613, "step": 1261 }, { "epoch": 0.39609243202950295, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.726, "step": 1262 }, { "epoch": 0.39640629291066737, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.5011, "step": 1263 }, { "epoch": 0.3967201537918318, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4721, "step": 1264 }, { "epoch": 0.3970340146729962, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6612, "step": 1265 }, { "epoch": 0.3973478755541606, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4582, "step": 1266 }, { "epoch": 0.39766173643532504, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4811, "step": 1267 }, { "epoch": 0.39797559731648946, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.4813, "step": 1268 }, { "epoch": 0.3982894581976539, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.567, "step": 1269 }, { "epoch": 0.3986033190788183, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.5624, "step": 1270 }, { "epoch": 0.3989171799599827, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3426, "step": 1271 }, { "epoch": 0.3992310408411472, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.701, "step": 1272 }, { "epoch": 0.3995449017223116, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5935, "step": 1273 }, { "epoch": 0.399858762603476, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.6688, "step": 1274 }, { "epoch": 0.40017262348464044, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.6152, "step": 1275 }, { "epoch": 0.40048648436580486, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.9036, "step": 1276 }, { "epoch": 0.4008003452469693, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.8394, "step": 1277 }, { "epoch": 0.4011142061281337, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.8218, "step": 1278 }, { "epoch": 0.4014280670092981, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.7053, "step": 1279 }, { "epoch": 0.40174192789046254, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.6976, "step": 1280 }, { "epoch": 0.40205578877162695, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.8774, "step": 1281 }, { "epoch": 0.40236964965279143, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.3061, "step": 1282 }, { "epoch": 0.40268351053395585, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 2.1891, "step": 1283 }, { "epoch": 0.40299737141512026, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.3157, "step": 1284 }, { "epoch": 0.4033112322962847, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.4991, "step": 1285 }, { "epoch": 0.4036250931774491, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.5725, "step": 1286 }, { "epoch": 0.4039389540586135, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.971, "step": 1287 }, { "epoch": 0.40425281493977794, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.7505, "step": 1288 }, { "epoch": 0.40456667582094236, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.991, "step": 1289 }, { "epoch": 0.4048805367021068, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.1957, "step": 1290 }, { "epoch": 0.4051943975832712, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.4308, "step": 1291 }, { "epoch": 0.4055082584644356, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.1227, "step": 1292 }, { "epoch": 0.4058221193456001, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 2.5946, "step": 1293 }, { "epoch": 0.4061359802267645, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.3162, "step": 1294 }, { "epoch": 0.4064498411079289, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.3666, "step": 1295 }, { "epoch": 0.40676370198909334, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.0322, "step": 1296 }, { "epoch": 0.40707756287025776, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 2.0175, "step": 1297 }, { "epoch": 0.4073914237514222, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.339, "step": 1298 }, { "epoch": 0.4077052846325866, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.1715, "step": 1299 }, { "epoch": 0.408019145513751, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 2.9335, "step": 1300 }, { "epoch": 0.40833300639491543, "grad_norm": 0.0810546875, "learning_rate": 0.0002, "loss": 1.3224, "step": 1301 }, { "epoch": 0.40864686727607985, "grad_norm": 0.0947265625, "learning_rate": 0.0002, "loss": 1.3435, "step": 1302 }, { "epoch": 0.4089607281572443, "grad_norm": 0.1162109375, "learning_rate": 0.0002, "loss": 1.4457, "step": 1303 }, { "epoch": 0.40927458903840874, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4359, "step": 1304 }, { "epoch": 0.40958844991957316, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.5369, "step": 1305 }, { "epoch": 0.4099023108007376, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.3717, "step": 1306 }, { "epoch": 0.410216171681902, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.6337, "step": 1307 }, { "epoch": 0.4105300325630664, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.5547, "step": 1308 }, { "epoch": 0.41084389344423083, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.5146, "step": 1309 }, { "epoch": 0.41084389344423083, "eval_loss": 1.7959522008895874, "eval_runtime": 142.4063, "eval_samples_per_second": 7.022, "eval_steps_per_second": 7.022, "step": 1309 }, { "epoch": 0.41084389344423083, "mmlu_eval_accuracy": 0.42657891675533444, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6166666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5697674418604651, "mmlu_eval_accuracy_moral_disputes": 0.5263157894736842, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.5151515151515151, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.35294117647058826, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.200824461015979, "step": 1309 }, { "epoch": 0.41115775432539525, "grad_norm": 0.103515625, "learning_rate": 0.0002, "loss": 1.3974, "step": 1310 }, { "epoch": 0.41147161520655967, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.3121, "step": 1311 }, { "epoch": 0.4117854760877241, "grad_norm": 0.11083984375, "learning_rate": 0.0002, "loss": 1.3059, "step": 1312 }, { "epoch": 0.41209933696888856, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.4512, "step": 1313 }, { "epoch": 0.412413197850053, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4743, "step": 1314 }, { "epoch": 0.4127270587312174, "grad_norm": 0.11181640625, "learning_rate": 0.0002, "loss": 1.4525, "step": 1315 }, { "epoch": 0.4130409196123818, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5023, "step": 1316 }, { "epoch": 0.41335478049354624, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.3757, "step": 1317 }, { "epoch": 0.41366864137471065, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5656, "step": 1318 }, { "epoch": 0.4139825022558751, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6074, "step": 1319 }, { "epoch": 0.4142963631370395, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5414, "step": 1320 }, { "epoch": 0.4146102240182039, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5299, "step": 1321 }, { "epoch": 0.41492408489936833, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.6561, "step": 1322 }, { "epoch": 0.4152379457805328, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.2308, "step": 1323 }, { "epoch": 0.4155518066616972, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.6029, "step": 1324 }, { "epoch": 0.41586566754286164, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4665, "step": 1325 }, { "epoch": 0.41617952842402606, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.6009, "step": 1326 }, { "epoch": 0.4164933893051905, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.7153, "step": 1327 }, { "epoch": 0.4168072501863549, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.8695, "step": 1328 }, { "epoch": 0.4171211110675193, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.9682, "step": 1329 }, { "epoch": 0.41743497194868373, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.8397, "step": 1330 }, { "epoch": 0.41774883282984815, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.594, "step": 1331 }, { "epoch": 0.41806269371101257, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.9119, "step": 1332 }, { "epoch": 0.41837655459217704, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.1301, "step": 1333 }, { "epoch": 0.41869041547334146, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4656, "step": 1334 }, { "epoch": 0.4190042763545059, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.9128, "step": 1335 }, { "epoch": 0.4193181372356703, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.8979, "step": 1336 }, { "epoch": 0.4196319981168347, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.1664, "step": 1337 }, { "epoch": 0.41994585899799913, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.746, "step": 1338 }, { "epoch": 0.42025971987916355, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 2.2825, "step": 1339 }, { "epoch": 0.42057358076032797, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7572, "step": 1340 }, { "epoch": 0.4208874416414924, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.2997, "step": 1341 }, { "epoch": 0.4212013025226568, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 2.6267, "step": 1342 }, { "epoch": 0.4215151634038213, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.9881, "step": 1343 }, { "epoch": 0.4218290242849857, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.358, "step": 1344 }, { "epoch": 0.4221428851661501, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.8243, "step": 1345 }, { "epoch": 0.42245674604731454, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.2387, "step": 1346 }, { "epoch": 0.42277060692847895, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.2483, "step": 1347 }, { "epoch": 0.42308446780964337, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.4793, "step": 1348 }, { "epoch": 0.4233983286908078, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 2.668, "step": 1349 }, { "epoch": 0.4237121895719722, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.9841, "step": 1350 }, { "epoch": 0.4240260504531366, "grad_norm": 0.0771484375, "learning_rate": 0.0002, "loss": 1.4247, "step": 1351 }, { "epoch": 0.42433991133430105, "grad_norm": 0.087890625, "learning_rate": 0.0002, "loss": 1.2979, "step": 1352 }, { "epoch": 0.4246537722154655, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.4912, "step": 1353 }, { "epoch": 0.42496763309662994, "grad_norm": 0.1015625, "learning_rate": 0.0002, "loss": 1.382, "step": 1354 }, { "epoch": 0.42528149397779436, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.345, "step": 1355 }, { "epoch": 0.4255953548589588, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.5246, "step": 1356 }, { "epoch": 0.4259092157401232, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.3958, "step": 1357 }, { "epoch": 0.4262230766212876, "grad_norm": 0.11376953125, "learning_rate": 0.0002, "loss": 1.5113, "step": 1358 }, { "epoch": 0.42653693750245203, "grad_norm": 0.10888671875, "learning_rate": 0.0002, "loss": 1.3832, "step": 1359 }, { "epoch": 0.42685079838361645, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.3995, "step": 1360 }, { "epoch": 0.42716465926478087, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3415, "step": 1361 }, { "epoch": 0.4274785201459453, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6801, "step": 1362 }, { "epoch": 0.42779238102710976, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.3269, "step": 1363 }, { "epoch": 0.4281062419082742, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.4729, "step": 1364 }, { "epoch": 0.4284201027894386, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.553, "step": 1365 }, { "epoch": 0.428733963670603, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.4457, "step": 1366 }, { "epoch": 0.42904782455176743, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4326, "step": 1367 }, { "epoch": 0.42936168543293185, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6397, "step": 1368 }, { "epoch": 0.42967554631409627, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.2723, "step": 1369 }, { "epoch": 0.4299894071952607, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.4549, "step": 1370 }, { "epoch": 0.4303032680764251, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.3559, "step": 1371 }, { "epoch": 0.4306171289575895, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5031, "step": 1372 }, { "epoch": 0.430930989838754, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5448, "step": 1373 }, { "epoch": 0.4312448507199184, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4973, "step": 1374 }, { "epoch": 0.43155871160108283, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.7627, "step": 1375 }, { "epoch": 0.43187257248224725, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.8157, "step": 1376 }, { "epoch": 0.43218643336341167, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.828, "step": 1377 }, { "epoch": 0.4325002942445761, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.8489, "step": 1378 }, { "epoch": 0.4328141551257405, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.9214, "step": 1379 }, { "epoch": 0.4331280160069049, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.9501, "step": 1380 }, { "epoch": 0.43344187688806934, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.7582, "step": 1381 }, { "epoch": 0.43375573776923376, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.9698, "step": 1382 }, { "epoch": 0.43406959865039824, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.0568, "step": 1383 }, { "epoch": 0.43438345953156265, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.9915, "step": 1384 }, { "epoch": 0.4346973204127271, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.3223, "step": 1385 }, { "epoch": 0.4350111812938915, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.9109, "step": 1386 }, { "epoch": 0.4353250421750559, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.236, "step": 1387 }, { "epoch": 0.43563890305622033, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.3323, "step": 1388 }, { "epoch": 0.43595276393738475, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.3034, "step": 1389 }, { "epoch": 0.43626662481854916, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.5363, "step": 1390 }, { "epoch": 0.4365804856997136, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.1009, "step": 1391 }, { "epoch": 0.436894346580878, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.165, "step": 1392 }, { "epoch": 0.4372082074620425, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.0697, "step": 1393 }, { "epoch": 0.4375220683432069, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.7393, "step": 1394 }, { "epoch": 0.4378359292243713, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.1264, "step": 1395 }, { "epoch": 0.43814979010553573, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.3717, "step": 1396 }, { "epoch": 0.43846365098670015, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.3081, "step": 1397 }, { "epoch": 0.43877751186786457, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.9563, "step": 1398 }, { "epoch": 0.439091372749029, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.0758, "step": 1399 }, { "epoch": 0.4394052336301934, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 3.0342, "step": 1400 }, { "epoch": 0.4397190945113578, "grad_norm": 0.07421875, "learning_rate": 0.0002, "loss": 1.3142, "step": 1401 }, { "epoch": 0.44003295539252224, "grad_norm": 0.09716796875, "learning_rate": 0.0002, "loss": 1.1996, "step": 1402 }, { "epoch": 0.4403468162736867, "grad_norm": 0.0986328125, "learning_rate": 0.0002, "loss": 1.39, "step": 1403 }, { "epoch": 0.44066067715485113, "grad_norm": 0.10888671875, "learning_rate": 0.0002, "loss": 1.3667, "step": 1404 }, { "epoch": 0.44097453803601555, "grad_norm": 0.109375, "learning_rate": 0.0002, "loss": 1.4288, "step": 1405 }, { "epoch": 0.44128839891717997, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.4433, "step": 1406 }, { "epoch": 0.4416022597983444, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.5323, "step": 1407 }, { "epoch": 0.4419161206795088, "grad_norm": 0.10888671875, "learning_rate": 0.0002, "loss": 1.3991, "step": 1408 }, { "epoch": 0.4422299815606732, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.6745, "step": 1409 }, { "epoch": 0.44254384244183764, "grad_norm": 0.10791015625, "learning_rate": 0.0002, "loss": 1.2545, "step": 1410 }, { "epoch": 0.44285770332300206, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.4249, "step": 1411 }, { "epoch": 0.4431715642041665, "grad_norm": 0.10986328125, "learning_rate": 0.0002, "loss": 1.4019, "step": 1412 }, { "epoch": 0.44348542508533095, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5158, "step": 1413 }, { "epoch": 0.44379928596649537, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.5706, "step": 1414 }, { "epoch": 0.4441131468476598, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.3462, "step": 1415 }, { "epoch": 0.4444270077288242, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.354, "step": 1416 }, { "epoch": 0.4447408686099886, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.71, "step": 1417 }, { "epoch": 0.44505472949115304, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3592, "step": 1418 }, { "epoch": 0.44536859037231746, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5735, "step": 1419 }, { "epoch": 0.4456824512534819, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.2627, "step": 1420 }, { "epoch": 0.4459963121346463, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.4576, "step": 1421 }, { "epoch": 0.4463101730158107, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5926, "step": 1422 }, { "epoch": 0.4466240338969752, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5425, "step": 1423 }, { "epoch": 0.4469378947781396, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.444, "step": 1424 }, { "epoch": 0.44725175565930403, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.5796, "step": 1425 }, { "epoch": 0.44756561654046845, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.8711, "step": 1426 }, { "epoch": 0.44787947742163287, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.9377, "step": 1427 }, { "epoch": 0.4481933383027973, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 2.199, "step": 1428 }, { "epoch": 0.4485071991839617, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.8875, "step": 1429 }, { "epoch": 0.4488210600651261, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 2.3309, "step": 1430 }, { "epoch": 0.44913492094629054, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.6421, "step": 1431 }, { "epoch": 0.44944878182745496, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.215, "step": 1432 }, { "epoch": 0.44976264270861943, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.2454, "step": 1433 }, { "epoch": 0.45007650358978385, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.5037, "step": 1434 }, { "epoch": 0.45039036447094827, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.1748, "step": 1435 }, { "epoch": 0.4507042253521127, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.5059, "step": 1436 }, { "epoch": 0.4510180862332771, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 2.2051, "step": 1437 }, { "epoch": 0.4513319471144415, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.0578, "step": 1438 }, { "epoch": 0.45164580799560594, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.4286, "step": 1439 }, { "epoch": 0.45195966887677036, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.1384, "step": 1440 }, { "epoch": 0.4522735297579348, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.4053, "step": 1441 }, { "epoch": 0.4525873906390992, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.3378, "step": 1442 }, { "epoch": 0.45290125152026367, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.9454, "step": 1443 }, { "epoch": 0.4532151124014281, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.3694, "step": 1444 }, { "epoch": 0.4535289732825925, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.3593, "step": 1445 }, { "epoch": 0.4538428341637569, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.019, "step": 1446 }, { "epoch": 0.45415669504492134, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.2726, "step": 1447 }, { "epoch": 0.45447055592608576, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.8556, "step": 1448 }, { "epoch": 0.4547844168072502, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.1626, "step": 1449 }, { "epoch": 0.4550982776884146, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 3.0538, "step": 1450 }, { "epoch": 0.455412138569579, "grad_norm": 0.08837890625, "learning_rate": 0.0002, "loss": 1.3931, "step": 1451 }, { "epoch": 0.45572599945074344, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.3111, "step": 1452 }, { "epoch": 0.4560398603319079, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.4333, "step": 1453 }, { "epoch": 0.4563537212130723, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4416, "step": 1454 }, { "epoch": 0.45666758209423675, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5229, "step": 1455 }, { "epoch": 0.45698144297540116, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5088, "step": 1456 }, { "epoch": 0.4572953038565656, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.4849, "step": 1457 }, { "epoch": 0.45760916473773, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.5722, "step": 1458 }, { "epoch": 0.4579230256188944, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.3394, "step": 1459 }, { "epoch": 0.45823688650005884, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.4405, "step": 1460 }, { "epoch": 0.45855074738122326, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.4214, "step": 1461 }, { "epoch": 0.4588646082623877, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3724, "step": 1462 }, { "epoch": 0.45917846914355215, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4223, "step": 1463 }, { "epoch": 0.45949233002471657, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4021, "step": 1464 }, { "epoch": 0.459806190905881, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5362, "step": 1465 }, { "epoch": 0.4601200517870454, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.4489, "step": 1466 }, { "epoch": 0.4604339126682098, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.4884, "step": 1467 }, { "epoch": 0.46074777354937424, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5037, "step": 1468 }, { "epoch": 0.46106163443053866, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.7523, "step": 1469 }, { "epoch": 0.4613754953117031, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5479, "step": 1470 }, { "epoch": 0.4616893561928675, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.5897, "step": 1471 }, { "epoch": 0.4620032170740319, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.7879, "step": 1472 }, { "epoch": 0.46231707795519633, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5347, "step": 1473 }, { "epoch": 0.4626309388363608, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.5437, "step": 1474 }, { "epoch": 0.4629447997175252, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.6741, "step": 1475 }, { "epoch": 0.46325866059868964, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.654, "step": 1476 }, { "epoch": 0.46357252147985406, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.951, "step": 1477 }, { "epoch": 0.4638863823610185, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.9001, "step": 1478 }, { "epoch": 0.4642002432421829, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.8554, "step": 1479 }, { "epoch": 0.4645141041233473, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.8504, "step": 1480 }, { "epoch": 0.46482796500451173, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.9588, "step": 1481 }, { "epoch": 0.46514182588567615, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 2.1388, "step": 1482 }, { "epoch": 0.46545568676684057, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.8643, "step": 1483 }, { "epoch": 0.46576954764800504, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.3198, "step": 1484 }, { "epoch": 0.46608340852916946, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.5174, "step": 1485 }, { "epoch": 0.4663972694103339, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.1416, "step": 1486 }, { "epoch": 0.4667111302914983, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.2324, "step": 1487 }, { "epoch": 0.4670249911726627, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.2841, "step": 1488 }, { "epoch": 0.46733885205382714, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.5772, "step": 1489 }, { "epoch": 0.46765271293499155, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 2.4692, "step": 1490 }, { "epoch": 0.467966573816156, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.1528, "step": 1491 }, { "epoch": 0.4682804346973204, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.0464, "step": 1492 }, { "epoch": 0.4685942955784848, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.8042, "step": 1493 }, { "epoch": 0.4689081564596493, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 2.0473, "step": 1494 }, { "epoch": 0.4692220173408137, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.8333, "step": 1495 }, { "epoch": 0.4695358782219781, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 2.6745, "step": 1496 }, { "epoch": 0.4695358782219781, "eval_loss": 1.7854562997817993, "eval_runtime": 123.6184, "eval_samples_per_second": 8.089, "eval_steps_per_second": 8.089, "step": 1496 }, { "epoch": 0.4695358782219781, "mmlu_eval_accuracy": 0.4358320976556104, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.5625, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.6363636363636364, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.6046511627906976, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.5294117647058824, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.34705882352941175, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.6363636363636364, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.7368421052631579, "mmlu_loss": 1.1939274586352704, "step": 1496 }, { "epoch": 0.46984973910314254, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.1995, "step": 1497 }, { "epoch": 0.47016359998430696, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.9518, "step": 1498 }, { "epoch": 0.4704774608654714, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.5733, "step": 1499 }, { "epoch": 0.4707913217466358, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 3.3822, "step": 1500 }, { "epoch": 0.4711051826278002, "grad_norm": 0.08447265625, "learning_rate": 0.0002, "loss": 1.3619, "step": 1501 }, { "epoch": 0.47141904350896463, "grad_norm": 0.09326171875, "learning_rate": 0.0002, "loss": 1.36, "step": 1502 }, { "epoch": 0.47173290439012905, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.3708, "step": 1503 }, { "epoch": 0.4720467652712935, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.6959, "step": 1504 }, { "epoch": 0.47236062615245794, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.6331, "step": 1505 }, { "epoch": 0.47267448703362236, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3806, "step": 1506 }, { "epoch": 0.4729883479147868, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.5572, "step": 1507 }, { "epoch": 0.4733022087959512, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.7204, "step": 1508 }, { "epoch": 0.4736160696771156, "grad_norm": 0.10986328125, "learning_rate": 0.0002, "loss": 1.2484, "step": 1509 }, { "epoch": 0.47392993055828003, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.4883, "step": 1510 }, { "epoch": 0.47424379143944445, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.4229, "step": 1511 }, { "epoch": 0.47455765232060887, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.5399, "step": 1512 }, { "epoch": 0.4748715132017733, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4752, "step": 1513 }, { "epoch": 0.47518537408293776, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.3572, "step": 1514 }, { "epoch": 0.4754992349641022, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.3585, "step": 1515 }, { "epoch": 0.4758130958452666, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.3959, "step": 1516 }, { "epoch": 0.476126956726431, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.4074, "step": 1517 }, { "epoch": 0.47644081760759543, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5638, "step": 1518 }, { "epoch": 0.47675467848875985, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.3467, "step": 1519 }, { "epoch": 0.47706853936992427, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.4784, "step": 1520 }, { "epoch": 0.4773824002510887, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.716, "step": 1521 }, { "epoch": 0.4776962611322531, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.3299, "step": 1522 }, { "epoch": 0.4780101220134175, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4561, "step": 1523 }, { "epoch": 0.478323982894582, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4871, "step": 1524 }, { "epoch": 0.4786378437757464, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.7243, "step": 1525 }, { "epoch": 0.47895170465691084, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.6425, "step": 1526 }, { "epoch": 0.47926556553807526, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.5112, "step": 1527 }, { "epoch": 0.4795794264192397, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.6385, "step": 1528 }, { "epoch": 0.4798932873004041, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7025, "step": 1529 }, { "epoch": 0.4802071481815685, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.2003, "step": 1530 }, { "epoch": 0.48052100906273293, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.3669, "step": 1531 }, { "epoch": 0.48083486994389735, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 2.0398, "step": 1532 }, { "epoch": 0.48114873082506177, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.8588, "step": 1533 }, { "epoch": 0.48146259170622624, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.4884, "step": 1534 }, { "epoch": 0.48177645258739066, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 2.7152, "step": 1535 }, { "epoch": 0.4820903134685551, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.975, "step": 1536 }, { "epoch": 0.4824041743497195, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.4158, "step": 1537 }, { "epoch": 0.4827180352308839, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.0198, "step": 1538 }, { "epoch": 0.48303189611204833, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.0766, "step": 1539 }, { "epoch": 0.48334575699321275, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.4221, "step": 1540 }, { "epoch": 0.48365961787437717, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 2.1022, "step": 1541 }, { "epoch": 0.4839734787555416, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.1633, "step": 1542 }, { "epoch": 0.484287339636706, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.2258, "step": 1543 }, { "epoch": 0.4846012005178705, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.8711, "step": 1544 }, { "epoch": 0.4849150613990349, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.2828, "step": 1545 }, { "epoch": 0.4852289222801993, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.8074, "step": 1546 }, { "epoch": 0.48554278316136373, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.1619, "step": 1547 }, { "epoch": 0.48585664404252815, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.0935, "step": 1548 }, { "epoch": 0.48617050492369257, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.1691, "step": 1549 }, { "epoch": 0.486484365804857, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.8085, "step": 1550 }, { "epoch": 0.4867982266860214, "grad_norm": 0.08056640625, "learning_rate": 0.0002, "loss": 1.4212, "step": 1551 }, { "epoch": 0.4871120875671858, "grad_norm": 0.09765625, "learning_rate": 0.0002, "loss": 1.3257, "step": 1552 }, { "epoch": 0.48742594844835024, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.4715, "step": 1553 }, { "epoch": 0.4877398093295147, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.5925, "step": 1554 }, { "epoch": 0.48805367021067914, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.4331, "step": 1555 }, { "epoch": 0.48836753109184355, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.4069, "step": 1556 }, { "epoch": 0.488681391973008, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3879, "step": 1557 }, { "epoch": 0.4889952528541724, "grad_norm": 0.1103515625, "learning_rate": 0.0002, "loss": 1.376, "step": 1558 }, { "epoch": 0.4893091137353368, "grad_norm": 0.107421875, "learning_rate": 0.0002, "loss": 1.3164, "step": 1559 }, { "epoch": 0.4896229746165012, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5005, "step": 1560 }, { "epoch": 0.48993683549766565, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.4592, "step": 1561 }, { "epoch": 0.49025069637883006, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.501, "step": 1562 }, { "epoch": 0.4905645572599945, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4858, "step": 1563 }, { "epoch": 0.49087841814115896, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.4335, "step": 1564 }, { "epoch": 0.4911922790223234, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.4227, "step": 1565 }, { "epoch": 0.4915061399034878, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.5895, "step": 1566 }, { "epoch": 0.4918200007846522, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.4231, "step": 1567 }, { "epoch": 0.49213386166581663, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.4445, "step": 1568 }, { "epoch": 0.49244772254698105, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5881, "step": 1569 }, { "epoch": 0.49276158342814547, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5042, "step": 1570 }, { "epoch": 0.4930754443093099, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.567, "step": 1571 }, { "epoch": 0.4933893051904743, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6221, "step": 1572 }, { "epoch": 0.4937031660716387, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.6948, "step": 1573 }, { "epoch": 0.4940170269528032, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.597, "step": 1574 }, { "epoch": 0.4943308878339676, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.6998, "step": 1575 }, { "epoch": 0.49464474871513203, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.5341, "step": 1576 }, { "epoch": 0.49495860959629645, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.7922, "step": 1577 }, { "epoch": 0.49527247047746087, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.7886, "step": 1578 }, { "epoch": 0.4955863313586253, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7991, "step": 1579 }, { "epoch": 0.4959001922397897, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.1349, "step": 1580 }, { "epoch": 0.4962140531209541, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.8186, "step": 1581 }, { "epoch": 0.49652791400211854, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.5956, "step": 1582 }, { "epoch": 0.49684177488328296, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 2.3392, "step": 1583 }, { "epoch": 0.49715563576444743, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.8827, "step": 1584 }, { "epoch": 0.49746949664561185, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 2.1126, "step": 1585 }, { "epoch": 0.49778335752677627, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.847, "step": 1586 }, { "epoch": 0.4980972184079407, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.3026, "step": 1587 }, { "epoch": 0.4984110792891051, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.4587, "step": 1588 }, { "epoch": 0.4987249401702695, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.9976, "step": 1589 }, { "epoch": 0.49903880105143394, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.1905, "step": 1590 }, { "epoch": 0.49935266193259836, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 2.089, "step": 1591 }, { "epoch": 0.4996665228137628, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.9884, "step": 1592 }, { "epoch": 0.4999803836949272, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.4244, "step": 1593 }, { "epoch": 0.5002942445760916, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.0699, "step": 1594 }, { "epoch": 0.5006081054572561, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.6804, "step": 1595 }, { "epoch": 0.5009219663384205, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.8598, "step": 1596 }, { "epoch": 0.5012358272195849, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.8533, "step": 1597 }, { "epoch": 0.5015496881007493, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.03, "step": 1598 }, { "epoch": 0.5018635489819138, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 2.8499, "step": 1599 }, { "epoch": 0.5021774098630782, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.8263, "step": 1600 }, { "epoch": 0.5024912707442426, "grad_norm": 0.07275390625, "learning_rate": 0.0002, "loss": 1.3193, "step": 1601 }, { "epoch": 0.5028051316254071, "grad_norm": 0.10546875, "learning_rate": 0.0002, "loss": 1.4002, "step": 1602 }, { "epoch": 0.5031189925065714, "grad_norm": 0.1025390625, "learning_rate": 0.0002, "loss": 1.3441, "step": 1603 }, { "epoch": 0.5034328533877359, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.4197, "step": 1604 }, { "epoch": 0.5037467142689003, "grad_norm": 0.12060546875, "learning_rate": 0.0002, "loss": 1.3579, "step": 1605 }, { "epoch": 0.5040605751500647, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.3139, "step": 1606 }, { "epoch": 0.5043744360312291, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3955, "step": 1607 }, { "epoch": 0.5046882969123936, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.6143, "step": 1608 }, { "epoch": 0.5050021577935581, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.3207, "step": 1609 }, { "epoch": 0.5053160186747224, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.3933, "step": 1610 }, { "epoch": 0.5056298795558869, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.449, "step": 1611 }, { "epoch": 0.5059437404370513, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.342, "step": 1612 }, { "epoch": 0.5062576013182157, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.457, "step": 1613 }, { "epoch": 0.5065714621993801, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.3511, "step": 1614 }, { "epoch": 0.5068853230805446, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.3069, "step": 1615 }, { "epoch": 0.5071991839617089, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4269, "step": 1616 }, { "epoch": 0.5075130448428734, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5465, "step": 1617 }, { "epoch": 0.5078269057240378, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6049, "step": 1618 }, { "epoch": 0.5081407666052022, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.411, "step": 1619 }, { "epoch": 0.5084546274863667, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.4713, "step": 1620 }, { "epoch": 0.5087684883675311, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5034, "step": 1621 }, { "epoch": 0.5090823492486956, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.6373, "step": 1622 }, { "epoch": 0.5093962101298599, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.7171, "step": 1623 }, { "epoch": 0.5097100710110244, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.415, "step": 1624 }, { "epoch": 0.5100239318921888, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.8343, "step": 1625 }, { "epoch": 0.5103377927733532, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.869, "step": 1626 }, { "epoch": 0.5106516536545176, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.7027, "step": 1627 }, { "epoch": 0.5109655145356821, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7334, "step": 1628 }, { "epoch": 0.5112793754168465, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.8781, "step": 1629 }, { "epoch": 0.5115932362980109, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.7577, "step": 1630 }, { "epoch": 0.5119070971791754, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 2.245, "step": 1631 }, { "epoch": 0.5122209580603397, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.851, "step": 1632 }, { "epoch": 0.5125348189415042, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.8517, "step": 1633 }, { "epoch": 0.5128486798226686, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 2.6865, "step": 1634 }, { "epoch": 0.513162540703833, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.5321, "step": 1635 }, { "epoch": 0.5134764015849974, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.2039, "step": 1636 }, { "epoch": 0.5137902624661619, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.0997, "step": 1637 }, { "epoch": 0.5141041233473262, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.5026, "step": 1638 }, { "epoch": 0.5144179842284907, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.027, "step": 1639 }, { "epoch": 0.5147318451096552, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.9668, "step": 1640 }, { "epoch": 0.5150457059908196, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.2957, "step": 1641 }, { "epoch": 0.515359566871984, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.3475, "step": 1642 }, { "epoch": 0.5156734277531484, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.2628, "step": 1643 }, { "epoch": 0.5159872886343129, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.5057, "step": 1644 }, { "epoch": 0.5163011495154772, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.9935, "step": 1645 }, { "epoch": 0.5166150103966417, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.4644, "step": 1646 }, { "epoch": 0.5169288712778061, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.177, "step": 1647 }, { "epoch": 0.5172427321589705, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.3228, "step": 1648 }, { "epoch": 0.517556593040135, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.629, "step": 1649 }, { "epoch": 0.5178704539212994, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.8592, "step": 1650 }, { "epoch": 0.5181843148024639, "grad_norm": 0.07763671875, "learning_rate": 0.0002, "loss": 1.34, "step": 1651 }, { "epoch": 0.5184981756836282, "grad_norm": 0.1083984375, "learning_rate": 0.0002, "loss": 1.4626, "step": 1652 }, { "epoch": 0.5188120365647927, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.5062, "step": 1653 }, { "epoch": 0.519125897445957, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4513, "step": 1654 }, { "epoch": 0.5194397583271215, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.2568, "step": 1655 }, { "epoch": 0.5197536192082859, "grad_norm": 0.1064453125, "learning_rate": 0.0002, "loss": 1.408, "step": 1656 }, { "epoch": 0.5200674800894504, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.2524, "step": 1657 }, { "epoch": 0.5203813409706147, "grad_norm": 0.11328125, "learning_rate": 0.0002, "loss": 1.3407, "step": 1658 }, { "epoch": 0.5206952018517792, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.4142, "step": 1659 }, { "epoch": 0.5210090627329437, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.4058, "step": 1660 }, { "epoch": 0.521322923614108, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.3903, "step": 1661 }, { "epoch": 0.5216367844952725, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.3566, "step": 1662 }, { "epoch": 0.5219506453764369, "grad_norm": 0.12158203125, "learning_rate": 0.0002, "loss": 1.4786, "step": 1663 }, { "epoch": 0.5222645062576013, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.5583, "step": 1664 }, { "epoch": 0.5225783671387657, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4332, "step": 1665 }, { "epoch": 0.5228922280199302, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5549, "step": 1666 }, { "epoch": 0.5232060889010945, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.544, "step": 1667 }, { "epoch": 0.523519949782259, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.5544, "step": 1668 }, { "epoch": 0.5238338106634234, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.8305, "step": 1669 }, { "epoch": 0.5241476715445879, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.4007, "step": 1670 }, { "epoch": 0.5244615324257523, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6236, "step": 1671 }, { "epoch": 0.5247753933069167, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.7004, "step": 1672 }, { "epoch": 0.5250892541880812, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5039, "step": 1673 }, { "epoch": 0.5254031150692455, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5396, "step": 1674 }, { "epoch": 0.52571697595041, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.6841, "step": 1675 }, { "epoch": 0.5260308368315744, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3799, "step": 1676 }, { "epoch": 0.5263446977127388, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3486, "step": 1677 }, { "epoch": 0.5266585585939032, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.7302, "step": 1678 }, { "epoch": 0.5269724194750677, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.9683, "step": 1679 }, { "epoch": 0.5272862803562322, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 2.2732, "step": 1680 }, { "epoch": 0.5276001412373965, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 2.1772, "step": 1681 }, { "epoch": 0.527914002118561, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.9295, "step": 1682 }, { "epoch": 0.5282278629997254, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.9274, "step": 1683 }, { "epoch": 0.5282278629997254, "eval_loss": 1.7762224674224854, "eval_runtime": 123.1506, "eval_samples_per_second": 8.12, "eval_steps_per_second": 8.12, "step": 1683 }, { "epoch": 0.5282278629997254, "mmlu_eval_accuracy": 0.438688172318441, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.5625, "mmlu_eval_accuracy_elementary_mathematics": 0.3902439024390244, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.65, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.5526315789473685, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.5151515151515151, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.37058823529411766, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.4215357886623667, "step": 1683 }, { "epoch": 0.5285417238808898, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 2.0894, "step": 1684 }, { "epoch": 0.5288555847620542, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 2.2266, "step": 1685 }, { "epoch": 0.5291694456432187, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 2.0832, "step": 1686 }, { "epoch": 0.529483306524383, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.0019, "step": 1687 }, { "epoch": 0.5297971674055475, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.0644, "step": 1688 }, { "epoch": 0.5301110282867119, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 2.1444, "step": 1689 }, { "epoch": 0.5304248891678763, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 2.3479, "step": 1690 }, { "epoch": 0.5307387500490408, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.8993, "step": 1691 }, { "epoch": 0.5310526109302052, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 2.1527, "step": 1692 }, { "epoch": 0.5313664718113696, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.3942, "step": 1693 }, { "epoch": 0.531680332692534, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.4652, "step": 1694 }, { "epoch": 0.5319941935736985, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.5987, "step": 1695 }, { "epoch": 0.5323080544548628, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.9468, "step": 1696 }, { "epoch": 0.5326219153360273, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.8692, "step": 1697 }, { "epoch": 0.5329357762171917, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.0162, "step": 1698 }, { "epoch": 0.5332496370983562, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.8707, "step": 1699 }, { "epoch": 0.5335634979795206, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.8278, "step": 1700 }, { "epoch": 0.533877358860685, "grad_norm": 0.08935546875, "learning_rate": 0.0002, "loss": 1.4067, "step": 1701 }, { "epoch": 0.5341912197418495, "grad_norm": 0.10400390625, "learning_rate": 0.0002, "loss": 1.4477, "step": 1702 }, { "epoch": 0.5345050806230138, "grad_norm": 0.11181640625, "learning_rate": 0.0002, "loss": 1.4612, "step": 1703 }, { "epoch": 0.5348189415041783, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.5085, "step": 1704 }, { "epoch": 0.5351328023853427, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.3591, "step": 1705 }, { "epoch": 0.5354466632665071, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.3802, "step": 1706 }, { "epoch": 0.5357605241476715, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.468, "step": 1707 }, { "epoch": 0.536074385028836, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4686, "step": 1708 }, { "epoch": 0.5363882459100003, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3237, "step": 1709 }, { "epoch": 0.5367021067911648, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.4139, "step": 1710 }, { "epoch": 0.5370159676723293, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.3979, "step": 1711 }, { "epoch": 0.5373298285534936, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.3369, "step": 1712 }, { "epoch": 0.5376436894346581, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.289, "step": 1713 }, { "epoch": 0.5379575503158225, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.361, "step": 1714 }, { "epoch": 0.538271411196987, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.3253, "step": 1715 }, { "epoch": 0.5385852720781513, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5354, "step": 1716 }, { "epoch": 0.5388991329593158, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5638, "step": 1717 }, { "epoch": 0.5392129938404802, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.481, "step": 1718 }, { "epoch": 0.5395268547216446, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3881, "step": 1719 }, { "epoch": 0.5398407156028091, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6546, "step": 1720 }, { "epoch": 0.5401545764839735, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3408, "step": 1721 }, { "epoch": 0.5404684373651379, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5049, "step": 1722 }, { "epoch": 0.5407822982463023, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4163, "step": 1723 }, { "epoch": 0.5410961591274668, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.7485, "step": 1724 }, { "epoch": 0.5414100200086311, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.4358, "step": 1725 }, { "epoch": 0.5417238808897956, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.7268, "step": 1726 }, { "epoch": 0.54203774177096, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.4654, "step": 1727 }, { "epoch": 0.5423516026521245, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 2.0668, "step": 1728 }, { "epoch": 0.5426654635332888, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 2.0296, "step": 1729 }, { "epoch": 0.5429793244144533, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.8383, "step": 1730 }, { "epoch": 0.5432931852956178, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.6799, "step": 1731 }, { "epoch": 0.5436070461767821, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.9518, "step": 1732 }, { "epoch": 0.5439209070579466, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.1723, "step": 1733 }, { "epoch": 0.544234767939111, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.9347, "step": 1734 }, { "epoch": 0.5445486288202754, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.0049, "step": 1735 }, { "epoch": 0.5448624897014398, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.1533, "step": 1736 }, { "epoch": 0.5451763505826043, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.9575, "step": 1737 }, { "epoch": 0.5454902114637686, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.0899, "step": 1738 }, { "epoch": 0.5458040723449331, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 2.5263, "step": 1739 }, { "epoch": 0.5461179332260976, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.1424, "step": 1740 }, { "epoch": 0.546431794107262, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.0427, "step": 1741 }, { "epoch": 0.5467456549884264, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.9787, "step": 1742 }, { "epoch": 0.5470595158695908, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.9796, "step": 1743 }, { "epoch": 0.5473733767507553, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.8059, "step": 1744 }, { "epoch": 0.5476872376319196, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.2813, "step": 1745 }, { "epoch": 0.5480010985130841, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.2681, "step": 1746 }, { "epoch": 0.5483149593942485, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.3991, "step": 1747 }, { "epoch": 0.5486288202754129, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.5695, "step": 1748 }, { "epoch": 0.5489426811565773, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.3254, "step": 1749 }, { "epoch": 0.5492565420377418, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.7664, "step": 1750 }, { "epoch": 0.5495704029189062, "grad_norm": 0.078125, "learning_rate": 0.0002, "loss": 1.4485, "step": 1751 }, { "epoch": 0.5498842638000706, "grad_norm": 0.10009765625, "learning_rate": 0.0002, "loss": 1.4486, "step": 1752 }, { "epoch": 0.5501981246812351, "grad_norm": 0.10888671875, "learning_rate": 0.0002, "loss": 1.4205, "step": 1753 }, { "epoch": 0.5505119855623994, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.4582, "step": 1754 }, { "epoch": 0.5508258464435639, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.503, "step": 1755 }, { "epoch": 0.5511397073247283, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.4567, "step": 1756 }, { "epoch": 0.5514535682058928, "grad_norm": 0.1123046875, "learning_rate": 0.0002, "loss": 1.3373, "step": 1757 }, { "epoch": 0.5517674290870571, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4963, "step": 1758 }, { "epoch": 0.5520812899682216, "grad_norm": 0.11376953125, "learning_rate": 0.0002, "loss": 1.3178, "step": 1759 }, { "epoch": 0.5523951508493861, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4644, "step": 1760 }, { "epoch": 0.5527090117305504, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5167, "step": 1761 }, { "epoch": 0.5530228726117149, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3299, "step": 1762 }, { "epoch": 0.5533367334928793, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.2503, "step": 1763 }, { "epoch": 0.5536505943740437, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4346, "step": 1764 }, { "epoch": 0.5539644552552081, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3315, "step": 1765 }, { "epoch": 0.5542783161363726, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3975, "step": 1766 }, { "epoch": 0.5545921770175369, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.1707, "step": 1767 }, { "epoch": 0.5549060378987014, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.6337, "step": 1768 }, { "epoch": 0.5552198987798658, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.4859, "step": 1769 }, { "epoch": 0.5555337596610302, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.521, "step": 1770 }, { "epoch": 0.5558476205421947, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.4928, "step": 1771 }, { "epoch": 0.5561614814233591, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.5976, "step": 1772 }, { "epoch": 0.5564753423045236, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4979, "step": 1773 }, { "epoch": 0.5567892031856879, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.5024, "step": 1774 }, { "epoch": 0.5571030640668524, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.412, "step": 1775 }, { "epoch": 0.5574169249480168, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.5321, "step": 1776 }, { "epoch": 0.5577307858291812, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.7385, "step": 1777 }, { "epoch": 0.5580446467103456, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.7201, "step": 1778 }, { "epoch": 0.5583585075915101, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.9139, "step": 1779 }, { "epoch": 0.5586723684726745, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.0861, "step": 1780 }, { "epoch": 0.5589862293538389, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 2.155, "step": 1781 }, { "epoch": 0.5593000902350034, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.086, "step": 1782 }, { "epoch": 0.5596139511161677, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.937, "step": 1783 }, { "epoch": 0.5599278119973322, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.3259, "step": 1784 }, { "epoch": 0.5602416728784966, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.1166, "step": 1785 }, { "epoch": 0.560555533759661, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 2.0513, "step": 1786 }, { "epoch": 0.5608693946408254, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.4344, "step": 1787 }, { "epoch": 0.5611832555219899, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.3386, "step": 1788 }, { "epoch": 0.5614971164031543, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.6012, "step": 1789 }, { "epoch": 0.5618109772843187, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.0929, "step": 1790 }, { "epoch": 0.5621248381654832, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.3641, "step": 1791 }, { "epoch": 0.5624386990466476, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 2.5071, "step": 1792 }, { "epoch": 0.562752559927812, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.3544, "step": 1793 }, { "epoch": 0.5630664208089764, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.0547, "step": 1794 }, { "epoch": 0.5633802816901409, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.2086, "step": 1795 }, { "epoch": 0.5636941425713052, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.958, "step": 1796 }, { "epoch": 0.5640080034524697, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.3892, "step": 1797 }, { "epoch": 0.5643218643336341, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.1997, "step": 1798 }, { "epoch": 0.5646357252147985, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.2381, "step": 1799 }, { "epoch": 0.564949586095963, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.6629, "step": 1800 }, { "epoch": 0.5652634469771274, "grad_norm": 0.07470703125, "learning_rate": 0.0002, "loss": 1.3331, "step": 1801 }, { "epoch": 0.5655773078582919, "grad_norm": 0.095703125, "learning_rate": 0.0002, "loss": 1.4385, "step": 1802 }, { "epoch": 0.5658911687394562, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.4304, "step": 1803 }, { "epoch": 0.5662050296206207, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.4488, "step": 1804 }, { "epoch": 0.566518890501785, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.4353, "step": 1805 }, { "epoch": 0.5668327513829495, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.4794, "step": 1806 }, { "epoch": 0.5671466122641139, "grad_norm": 0.11376953125, "learning_rate": 0.0002, "loss": 1.4254, "step": 1807 }, { "epoch": 0.5674604731452784, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.5094, "step": 1808 }, { "epoch": 0.5677743340264427, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.4609, "step": 1809 }, { "epoch": 0.5680881949076072, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.1764, "step": 1810 }, { "epoch": 0.5684020557887717, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.3245, "step": 1811 }, { "epoch": 0.568715916669936, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.4325, "step": 1812 }, { "epoch": 0.5690297775511005, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4363, "step": 1813 }, { "epoch": 0.5693436384322649, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.4602, "step": 1814 }, { "epoch": 0.5696574993134293, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.2947, "step": 1815 }, { "epoch": 0.5699713601945937, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.4927, "step": 1816 }, { "epoch": 0.5702852210757582, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4334, "step": 1817 }, { "epoch": 0.5705990819569225, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.3099, "step": 1818 }, { "epoch": 0.570912942838087, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.409, "step": 1819 }, { "epoch": 0.5712268037192515, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.5078, "step": 1820 }, { "epoch": 0.5715406646004159, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.6973, "step": 1821 }, { "epoch": 0.5718545254815803, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5313, "step": 1822 }, { "epoch": 0.5721683863627447, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4193, "step": 1823 }, { "epoch": 0.5724822472439092, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.6857, "step": 1824 }, { "epoch": 0.5727961081250735, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.7664, "step": 1825 }, { "epoch": 0.573109969006238, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.6576, "step": 1826 }, { "epoch": 0.5734238298874024, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.1437, "step": 1827 }, { "epoch": 0.5737376907685668, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.103, "step": 1828 }, { "epoch": 0.5740515516497312, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.8686, "step": 1829 }, { "epoch": 0.5743654125308957, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.706, "step": 1830 }, { "epoch": 0.5746792734120602, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.7805, "step": 1831 }, { "epoch": 0.5749931342932245, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.52, "step": 1832 }, { "epoch": 0.575306995174389, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.8605, "step": 1833 }, { "epoch": 0.5756208560555534, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.0258, "step": 1834 }, { "epoch": 0.5759347169367178, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 2.0296, "step": 1835 }, { "epoch": 0.5762485778178822, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.0942, "step": 1836 }, { "epoch": 0.5765624386990467, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.1924, "step": 1837 }, { "epoch": 0.576876299580211, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.4209, "step": 1838 }, { "epoch": 0.5771901604613755, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.9007, "step": 1839 }, { "epoch": 0.5775040213425399, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 3.0011, "step": 1840 }, { "epoch": 0.5778178822237043, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.9979, "step": 1841 }, { "epoch": 0.5781317431048688, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.0207, "step": 1842 }, { "epoch": 0.5784456039860332, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.9656, "step": 1843 }, { "epoch": 0.5787594648671976, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.6595, "step": 1844 }, { "epoch": 0.579073325748362, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.6086, "step": 1845 }, { "epoch": 0.5793871866295265, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.3001, "step": 1846 }, { "epoch": 0.5797010475106908, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.8254, "step": 1847 }, { "epoch": 0.5800149083918553, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.1071, "step": 1848 }, { "epoch": 0.5803287692730197, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.1997, "step": 1849 }, { "epoch": 0.5806426301541842, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.7012, "step": 1850 }, { "epoch": 0.5809564910353486, "grad_norm": 0.10302734375, "learning_rate": 0.0002, "loss": 1.3707, "step": 1851 }, { "epoch": 0.581270351916513, "grad_norm": 0.107421875, "learning_rate": 0.0002, "loss": 1.3206, "step": 1852 }, { "epoch": 0.5815842127976775, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5333, "step": 1853 }, { "epoch": 0.5818980736788418, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.6303, "step": 1854 }, { "epoch": 0.5822119345600063, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4587, "step": 1855 }, { "epoch": 0.5825257954411707, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4127, "step": 1856 }, { "epoch": 0.5828396563223351, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.3319, "step": 1857 }, { "epoch": 0.5831535172034995, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4086, "step": 1858 }, { "epoch": 0.583467378084664, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4039, "step": 1859 }, { "epoch": 0.5837812389658283, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4437, "step": 1860 }, { "epoch": 0.5840950998469928, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4793, "step": 1861 }, { "epoch": 0.5844089607281573, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.5648, "step": 1862 }, { "epoch": 0.5847228216093217, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5329, "step": 1863 }, { "epoch": 0.5850366824904861, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.5164, "step": 1864 }, { "epoch": 0.5853505433716505, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.592, "step": 1865 }, { "epoch": 0.585664404252815, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4296, "step": 1866 }, { "epoch": 0.5859782651339793, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.4401, "step": 1867 }, { "epoch": 0.5862921260151438, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.4334, "step": 1868 }, { "epoch": 0.5866059868963082, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3723, "step": 1869 }, { "epoch": 0.5869198477774726, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.513, "step": 1870 }, { "epoch": 0.5869198477774726, "eval_loss": 1.7979594469070435, "eval_runtime": 123.3498, "eval_samples_per_second": 8.107, "eval_steps_per_second": 8.107, "step": 1870 }, { "epoch": 0.5869198477774726, "mmlu_eval_accuracy": 0.4160179039656003, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.3902439024390244, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3953488372093023, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, "mmlu_eval_accuracy_high_school_psychology": 0.5833333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5769230769230769, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.45161290322580644, "mmlu_eval_accuracy_professional_law": 0.36470588235294116, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2989568939635678, "step": 1870 }, { "epoch": 0.5872337086586371, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.4714, "step": 1871 }, { "epoch": 0.5875475695398015, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.6044, "step": 1872 }, { "epoch": 0.587861430420966, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.7849, "step": 1873 }, { "epoch": 0.5881752913021303, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.8635, "step": 1874 }, { "epoch": 0.5884891521832948, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.7972, "step": 1875 }, { "epoch": 0.5888030130644591, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.7615, "step": 1876 }, { "epoch": 0.5891168739456236, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.6072, "step": 1877 }, { "epoch": 0.589430734826788, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.8632, "step": 1878 }, { "epoch": 0.5897445957079525, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.0262, "step": 1879 }, { "epoch": 0.5900584565891168, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 2.0907, "step": 1880 }, { "epoch": 0.5903723174702813, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.9217, "step": 1881 }, { "epoch": 0.5906861783514458, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.2323, "step": 1882 }, { "epoch": 0.5910000392326101, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.0756, "step": 1883 }, { "epoch": 0.5913139001137746, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.1437, "step": 1884 }, { "epoch": 0.591627760994939, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 2.1764, "step": 1885 }, { "epoch": 0.5919416218761034, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.9309, "step": 1886 }, { "epoch": 0.5922554827572678, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.2414, "step": 1887 }, { "epoch": 0.5925693436384323, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 2.5843, "step": 1888 }, { "epoch": 0.5928832045195966, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.8418, "step": 1889 }, { "epoch": 0.5931970654007611, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.9504, "step": 1890 }, { "epoch": 0.5935109262819256, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.0956, "step": 1891 }, { "epoch": 0.59382478716309, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.7819, "step": 1892 }, { "epoch": 0.5941386480442544, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.29, "step": 1893 }, { "epoch": 0.5944525089254188, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.695, "step": 1894 }, { "epoch": 0.5947663698065833, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 2.5746, "step": 1895 }, { "epoch": 0.5950802306877476, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.748, "step": 1896 }, { "epoch": 0.5953940915689121, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.7555, "step": 1897 }, { "epoch": 0.5957079524500765, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.1987, "step": 1898 }, { "epoch": 0.5960218133312409, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 2.6141, "step": 1899 }, { "epoch": 0.5963356742124053, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 3.0325, "step": 1900 }, { "epoch": 0.5966495350935698, "grad_norm": 0.09912109375, "learning_rate": 0.0002, "loss": 1.353, "step": 1901 }, { "epoch": 0.5969633959747342, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.4564, "step": 1902 }, { "epoch": 0.5972772568558986, "grad_norm": 0.103515625, "learning_rate": 0.0002, "loss": 1.3223, "step": 1903 }, { "epoch": 0.5975911177370631, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3212, "step": 1904 }, { "epoch": 0.5979049786182274, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4801, "step": 1905 }, { "epoch": 0.5982188394993919, "grad_norm": 0.11376953125, "learning_rate": 0.0002, "loss": 1.2425, "step": 1906 }, { "epoch": 0.5985327003805563, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5301, "step": 1907 }, { "epoch": 0.5988465612617208, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6526, "step": 1908 }, { "epoch": 0.5991604221428851, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.359, "step": 1909 }, { "epoch": 0.5994742830240496, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3382, "step": 1910 }, { "epoch": 0.5997881439052141, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.5714, "step": 1911 }, { "epoch": 0.6001020047863784, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4322, "step": 1912 }, { "epoch": 0.6004158656675429, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.5023, "step": 1913 }, { "epoch": 0.6007297265487073, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4193, "step": 1914 }, { "epoch": 0.6010435874298717, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4606, "step": 1915 }, { "epoch": 0.6013574483110361, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4069, "step": 1916 }, { "epoch": 0.6016713091922006, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.635, "step": 1917 }, { "epoch": 0.6019851700733649, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4834, "step": 1918 }, { "epoch": 0.6022990309545294, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.4178, "step": 1919 }, { "epoch": 0.6026128918356938, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4411, "step": 1920 }, { "epoch": 0.6029267527168582, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5469, "step": 1921 }, { "epoch": 0.6032406135980227, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.3598, "step": 1922 }, { "epoch": 0.6035544744791871, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4062, "step": 1923 }, { "epoch": 0.6038683353603516, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.469, "step": 1924 }, { "epoch": 0.6041821962415159, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.6936, "step": 1925 }, { "epoch": 0.6044960571226804, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.9837, "step": 1926 }, { "epoch": 0.6048099180038448, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.9, "step": 1927 }, { "epoch": 0.6051237788850092, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.4733, "step": 1928 }, { "epoch": 0.6054376397661736, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.5585, "step": 1929 }, { "epoch": 0.6057515006473381, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.9682, "step": 1930 }, { "epoch": 0.6060653615285025, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7815, "step": 1931 }, { "epoch": 0.6063792224096669, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.9774, "step": 1932 }, { "epoch": 0.6066930832908314, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 2.0198, "step": 1933 }, { "epoch": 0.6070069441719957, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.9991, "step": 1934 }, { "epoch": 0.6073208050531602, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.0222, "step": 1935 }, { "epoch": 0.6076346659343246, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 2.1085, "step": 1936 }, { "epoch": 0.607948526815489, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.104, "step": 1937 }, { "epoch": 0.6082623876966534, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.2331, "step": 1938 }, { "epoch": 0.6085762485778179, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 2.3148, "step": 1939 }, { "epoch": 0.6088901094589823, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.8777, "step": 1940 }, { "epoch": 0.6092039703401467, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.0675, "step": 1941 }, { "epoch": 0.6095178312213112, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.0153, "step": 1942 }, { "epoch": 0.6098316921024756, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.746, "step": 1943 }, { "epoch": 0.61014555298364, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.3711, "step": 1944 }, { "epoch": 0.6104594138648044, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.7351, "step": 1945 }, { "epoch": 0.6107732747459689, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.8861, "step": 1946 }, { "epoch": 0.6110871356271332, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.8354, "step": 1947 }, { "epoch": 0.6114009965082977, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.2304, "step": 1948 }, { "epoch": 0.6117148573894621, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.6094, "step": 1949 }, { "epoch": 0.6120287182706265, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 2.7485, "step": 1950 }, { "epoch": 0.612342579151791, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5545, "step": 1951 }, { "epoch": 0.6126564400329554, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4116, "step": 1952 }, { "epoch": 0.6129703009141199, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.3595, "step": 1953 }, { "epoch": 0.6132841617952842, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.4558, "step": 1954 }, { "epoch": 0.6135980226764487, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.6147, "step": 1955 }, { "epoch": 0.6139118835576131, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3496, "step": 1956 }, { "epoch": 0.6142257444387775, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.4669, "step": 1957 }, { "epoch": 0.6145396053199419, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.4654, "step": 1958 }, { "epoch": 0.6148534662011064, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.4288, "step": 1959 }, { "epoch": 0.6151673270822707, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.3818, "step": 1960 }, { "epoch": 0.6154811879634352, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.4344, "step": 1961 }, { "epoch": 0.6157950488445997, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.3964, "step": 1962 }, { "epoch": 0.616108909725764, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.5546, "step": 1963 }, { "epoch": 0.6164227706069285, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.5167, "step": 1964 }, { "epoch": 0.6167366314880929, "grad_norm": 0.12060546875, "learning_rate": 0.0002, "loss": 1.3906, "step": 1965 }, { "epoch": 0.6170504923692574, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.5305, "step": 1966 }, { "epoch": 0.6173643532504217, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.4886, "step": 1967 }, { "epoch": 0.6176782141315862, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3195, "step": 1968 }, { "epoch": 0.6179920750127506, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6338, "step": 1969 }, { "epoch": 0.618305935893915, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.1924, "step": 1970 }, { "epoch": 0.6186197967750795, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 1.8635, "step": 1971 }, { "epoch": 0.6189336576562439, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 1.705, "step": 1972 }, { "epoch": 0.6192475185374083, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.4895, "step": 1973 }, { "epoch": 0.6195613794185727, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.4939, "step": 1974 }, { "epoch": 0.6198752402997372, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.551, "step": 1975 }, { "epoch": 0.6201891011809015, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.6042, "step": 1976 }, { "epoch": 0.620502962062066, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6444, "step": 1977 }, { "epoch": 0.6208168229432304, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.2018, "step": 1978 }, { "epoch": 0.6211306838243948, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.7853, "step": 1979 }, { "epoch": 0.6214445447055592, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7374, "step": 1980 }, { "epoch": 0.6217584055867237, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.6137, "step": 1981 }, { "epoch": 0.6220722664678882, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.2201, "step": 1982 }, { "epoch": 0.6223861273490525, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.8339, "step": 1983 }, { "epoch": 0.622699988230217, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 2.3659, "step": 1984 }, { "epoch": 0.6230138491113814, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 2.1963, "step": 1985 }, { "epoch": 0.6233277099925458, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.8056, "step": 1986 }, { "epoch": 0.6236415708737102, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.1867, "step": 1987 }, { "epoch": 0.6239554317548747, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.9651, "step": 1988 }, { "epoch": 0.624269292636039, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.3103, "step": 1989 }, { "epoch": 0.6245831535172035, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.2022, "step": 1990 }, { "epoch": 0.624897014398368, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.1621, "step": 1991 }, { "epoch": 0.6252108752795323, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.5333, "step": 1992 }, { "epoch": 0.6255247361606968, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 2.9862, "step": 1993 }, { "epoch": 0.6258385970418612, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.0522, "step": 1994 }, { "epoch": 0.6261524579230257, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.937, "step": 1995 }, { "epoch": 0.62646631880419, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.407, "step": 1996 }, { "epoch": 0.6267801796853545, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.128, "step": 1997 }, { "epoch": 0.6270940405665189, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.4289, "step": 1998 }, { "epoch": 0.6274079014476833, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.9464, "step": 1999 }, { "epoch": 0.6277217623288477, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.3799, "step": 2000 }, { "epoch": 0.6280356232100122, "grad_norm": 0.0859375, "learning_rate": 0.0002, "loss": 1.3815, "step": 2001 }, { "epoch": 0.6283494840911766, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4014, "step": 2002 }, { "epoch": 0.628663344972341, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.5265, "step": 2003 }, { "epoch": 0.6289772058535055, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.524, "step": 2004 }, { "epoch": 0.6292910667346698, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5699, "step": 2005 }, { "epoch": 0.6296049276158343, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3892, "step": 2006 }, { "epoch": 0.6299187884969987, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4151, "step": 2007 }, { "epoch": 0.6302326493781631, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4622, "step": 2008 }, { "epoch": 0.6305465102593275, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4662, "step": 2009 }, { "epoch": 0.630860371140492, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.4608, "step": 2010 }, { "epoch": 0.6311742320216565, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.4231, "step": 2011 }, { "epoch": 0.6314880929028208, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4065, "step": 2012 }, { "epoch": 0.6318019537839853, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.683, "step": 2013 }, { "epoch": 0.6321158146651497, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.4582, "step": 2014 }, { "epoch": 0.6324296755463141, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.389, "step": 2015 }, { "epoch": 0.6327435364274785, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.2963, "step": 2016 }, { "epoch": 0.633057397308643, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.6131, "step": 2017 }, { "epoch": 0.6333712581898073, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4931, "step": 2018 }, { "epoch": 0.6336851190709718, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.6715, "step": 2019 }, { "epoch": 0.6339989799521362, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.4375, "step": 2020 }, { "epoch": 0.6343128408333006, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.6019, "step": 2021 }, { "epoch": 0.6346267017144651, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.37, "step": 2022 }, { "epoch": 0.6349405625956295, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5931, "step": 2023 }, { "epoch": 0.635254423476794, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.7223, "step": 2024 }, { "epoch": 0.6355682843579583, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.6626, "step": 2025 }, { "epoch": 0.6358821452391228, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.4276, "step": 2026 }, { "epoch": 0.6361960061202871, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.7796, "step": 2027 }, { "epoch": 0.6365098670014516, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.6912, "step": 2028 }, { "epoch": 0.636823727882616, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.8183, "step": 2029 }, { "epoch": 0.6371375887637805, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.7979, "step": 2030 }, { "epoch": 0.6374514496449448, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 1.9475, "step": 2031 }, { "epoch": 0.6377653105261093, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.9001, "step": 2032 }, { "epoch": 0.6380791714072738, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.8966, "step": 2033 }, { "epoch": 0.6383930322884381, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.4315, "step": 2034 }, { "epoch": 0.6387068931696026, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.07, "step": 2035 }, { "epoch": 0.639020754050767, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.5365, "step": 2036 }, { "epoch": 0.6393346149319314, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.3692, "step": 2037 }, { "epoch": 0.6396484758130958, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.9882, "step": 2038 }, { "epoch": 0.6399623366942603, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.08, "step": 2039 }, { "epoch": 0.6402761975754246, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.1133, "step": 2040 }, { "epoch": 0.6405900584565891, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.3706, "step": 2041 }, { "epoch": 0.6409039193377536, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.2254, "step": 2042 }, { "epoch": 0.641217780218918, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.3317, "step": 2043 }, { "epoch": 0.6415316411000824, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.5359, "step": 2044 }, { "epoch": 0.6418455019812468, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.541, "step": 2045 }, { "epoch": 0.6421593628624113, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 2.2715, "step": 2046 }, { "epoch": 0.6424732237435756, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.9943, "step": 2047 }, { "epoch": 0.6427870846247401, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.1426, "step": 2048 }, { "epoch": 0.6431009455059045, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.2063, "step": 2049 }, { "epoch": 0.6434148063870689, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.5465, "step": 2050 }, { "epoch": 0.6437286672682333, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.5447, "step": 2051 }, { "epoch": 0.6440425281493978, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.4158, "step": 2052 }, { "epoch": 0.6443563890305622, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5656, "step": 2053 }, { "epoch": 0.6446702499117266, "grad_norm": 0.11279296875, "learning_rate": 0.0002, "loss": 1.5103, "step": 2054 }, { "epoch": 0.6449841107928911, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3915, "step": 2055 }, { "epoch": 0.6452979716740554, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3889, "step": 2056 }, { "epoch": 0.6456118325552199, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5592, "step": 2057 }, { "epoch": 0.6456118325552199, "eval_loss": 1.7869147062301636, "eval_runtime": 123.5793, "eval_samples_per_second": 8.092, "eval_steps_per_second": 8.092, "step": 2057 }, { "epoch": 0.6456118325552199, "mmlu_eval_accuracy": 0.4170033028018425, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.3902439024390244, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.5333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.5151515151515151, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.3588235294117647, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.5555555555555556, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 0.9896279288167505, "step": 2057 }, { "epoch": 0.6459256934363843, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.5043, "step": 2058 }, { "epoch": 0.6462395543175488, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.4262, "step": 2059 }, { "epoch": 0.6465534151987131, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.5209, "step": 2060 }, { "epoch": 0.6468672760798776, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.3202, "step": 2061 }, { "epoch": 0.6471811369610421, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.2827, "step": 2062 }, { "epoch": 0.6474949978422064, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4121, "step": 2063 }, { "epoch": 0.6478088587233709, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4375, "step": 2064 }, { "epoch": 0.6481227196045353, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.2448, "step": 2065 }, { "epoch": 0.6484365804856997, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4004, "step": 2066 }, { "epoch": 0.6487504413668641, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.3909, "step": 2067 }, { "epoch": 0.6490643022480286, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.3046, "step": 2068 }, { "epoch": 0.6493781631291929, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.3599, "step": 2069 }, { "epoch": 0.6496920240103574, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3598, "step": 2070 }, { "epoch": 0.6500058848915218, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.8061, "step": 2071 }, { "epoch": 0.6503197457726863, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3645, "step": 2072 }, { "epoch": 0.6506336066538507, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.607, "step": 2073 }, { "epoch": 0.6509474675350151, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.7126, "step": 2074 }, { "epoch": 0.6512613284161796, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.4475, "step": 2075 }, { "epoch": 0.6515751892973439, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.778, "step": 2076 }, { "epoch": 0.6518890501785084, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.6442, "step": 2077 }, { "epoch": 0.6522029110596728, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.5434, "step": 2078 }, { "epoch": 0.6525167719408372, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.7534, "step": 2079 }, { "epoch": 0.6528306328220016, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.7837, "step": 2080 }, { "epoch": 0.6531444937031661, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.125, "step": 2081 }, { "epoch": 0.6534583545843305, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 2.1823, "step": 2082 }, { "epoch": 0.6537722154654949, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.1828, "step": 2083 }, { "epoch": 0.6540860763466594, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.9816, "step": 2084 }, { "epoch": 0.6543999372278237, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.2713, "step": 2085 }, { "epoch": 0.6547137981089882, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.7811, "step": 2086 }, { "epoch": 0.6550276589901526, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.0191, "step": 2087 }, { "epoch": 0.6553415198713171, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.0255, "step": 2088 }, { "epoch": 0.6556553807524814, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.7514, "step": 2089 }, { "epoch": 0.6559692416336459, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.3259, "step": 2090 }, { "epoch": 0.6562831025148103, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.3979, "step": 2091 }, { "epoch": 0.6565969633959747, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8578, "step": 2092 }, { "epoch": 0.6569108242771392, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.9541, "step": 2093 }, { "epoch": 0.6572246851583036, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.5271, "step": 2094 }, { "epoch": 0.657538546039468, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.8429, "step": 2095 }, { "epoch": 0.6578524069206324, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.8997, "step": 2096 }, { "epoch": 0.6581662678017969, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.293, "step": 2097 }, { "epoch": 0.6584801286829612, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.1048, "step": 2098 }, { "epoch": 0.6587939895641257, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.4086, "step": 2099 }, { "epoch": 0.6591078504452901, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 3.0144, "step": 2100 }, { "epoch": 0.6594217113264546, "grad_norm": 0.0849609375, "learning_rate": 0.0002, "loss": 1.3451, "step": 2101 }, { "epoch": 0.659735572207619, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.347, "step": 2102 }, { "epoch": 0.6600494330887834, "grad_norm": 0.11279296875, "learning_rate": 0.0002, "loss": 1.3807, "step": 2103 }, { "epoch": 0.6603632939699479, "grad_norm": 0.10498046875, "learning_rate": 0.0002, "loss": 1.4438, "step": 2104 }, { "epoch": 0.6606771548511122, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.404, "step": 2105 }, { "epoch": 0.6609910157322767, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4676, "step": 2106 }, { "epoch": 0.6613048766134411, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.2827, "step": 2107 }, { "epoch": 0.6616187374946055, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5957, "step": 2108 }, { "epoch": 0.6619325983757699, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.487, "step": 2109 }, { "epoch": 0.6622464592569344, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4979, "step": 2110 }, { "epoch": 0.6625603201380987, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4709, "step": 2111 }, { "epoch": 0.6628741810192632, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.5848, "step": 2112 }, { "epoch": 0.6631880419004277, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5552, "step": 2113 }, { "epoch": 0.663501902781592, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4407, "step": 2114 }, { "epoch": 0.6638157636627565, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3967, "step": 2115 }, { "epoch": 0.6641296245439209, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.6367, "step": 2116 }, { "epoch": 0.6644434854250854, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2283, "step": 2117 }, { "epoch": 0.6647573463062497, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.7342, "step": 2118 }, { "epoch": 0.6650712071874142, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.4274, "step": 2119 }, { "epoch": 0.6653850680685786, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.466, "step": 2120 }, { "epoch": 0.665698928949743, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4551, "step": 2121 }, { "epoch": 0.6660127898309075, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.6061, "step": 2122 }, { "epoch": 0.6663266507120719, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.4578, "step": 2123 }, { "epoch": 0.6666405115932363, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.7235, "step": 2124 }, { "epoch": 0.6669543724744007, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.6891, "step": 2125 }, { "epoch": 0.6672682333555652, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.894, "step": 2126 }, { "epoch": 0.6675820942367295, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.91, "step": 2127 }, { "epoch": 0.667895955117894, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 2.1388, "step": 2128 }, { "epoch": 0.6682098159990584, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.6798, "step": 2129 }, { "epoch": 0.6685236768802229, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4976, "step": 2130 }, { "epoch": 0.6688375377613872, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.9372, "step": 2131 }, { "epoch": 0.6691513986425517, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.1857, "step": 2132 }, { "epoch": 0.6694652595237162, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.9852, "step": 2133 }, { "epoch": 0.6697791204048805, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 2.1061, "step": 2134 }, { "epoch": 0.670092981286045, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.8907, "step": 2135 }, { "epoch": 0.6704068421672094, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 2.0113, "step": 2136 }, { "epoch": 0.6707207030483738, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.0771, "step": 2137 }, { "epoch": 0.6710345639295382, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.6552, "step": 2138 }, { "epoch": 0.6713484248107027, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.2316, "step": 2139 }, { "epoch": 0.671662285691867, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.2031, "step": 2140 }, { "epoch": 0.6719761465730315, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.4282, "step": 2141 }, { "epoch": 0.672290007454196, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.2798, "step": 2142 }, { "epoch": 0.6726038683353603, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.9225, "step": 2143 }, { "epoch": 0.6729177292165248, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.0801, "step": 2144 }, { "epoch": 0.6732315900976892, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.1691, "step": 2145 }, { "epoch": 0.6735454509788537, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.5013, "step": 2146 }, { "epoch": 0.673859311860018, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 2.3714, "step": 2147 }, { "epoch": 0.6741731727411825, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.117, "step": 2148 }, { "epoch": 0.6744870336223469, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.5755, "step": 2149 }, { "epoch": 0.6748008945035113, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 3.2677, "step": 2150 }, { "epoch": 0.6751147553846757, "grad_norm": 0.10546875, "learning_rate": 0.0002, "loss": 1.1853, "step": 2151 }, { "epoch": 0.6754286162658402, "grad_norm": 0.09326171875, "learning_rate": 0.0002, "loss": 1.3131, "step": 2152 }, { "epoch": 0.6757424771470046, "grad_norm": 0.095703125, "learning_rate": 0.0002, "loss": 1.2502, "step": 2153 }, { "epoch": 0.676056338028169, "grad_norm": 0.10791015625, "learning_rate": 0.0002, "loss": 1.3046, "step": 2154 }, { "epoch": 0.6763701989093335, "grad_norm": 0.10595703125, "learning_rate": 0.0002, "loss": 1.4713, "step": 2155 }, { "epoch": 0.6766840597904978, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.4688, "step": 2156 }, { "epoch": 0.6769979206716623, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3407, "step": 2157 }, { "epoch": 0.6773117815528267, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.426, "step": 2158 }, { "epoch": 0.6776256424339911, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.3701, "step": 2159 }, { "epoch": 0.6779395033151555, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.3456, "step": 2160 }, { "epoch": 0.67825336419632, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.2171, "step": 2161 }, { "epoch": 0.6785672250774845, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.545, "step": 2162 }, { "epoch": 0.6788810859586488, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.3447, "step": 2163 }, { "epoch": 0.6791949468398133, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4775, "step": 2164 }, { "epoch": 0.6795088077209777, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.4556, "step": 2165 }, { "epoch": 0.6798226686021421, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5541, "step": 2166 }, { "epoch": 0.6801365294833065, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4576, "step": 2167 }, { "epoch": 0.680450390364471, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.2738, "step": 2168 }, { "epoch": 0.6807642512456353, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6087, "step": 2169 }, { "epoch": 0.6810781121267998, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3709, "step": 2170 }, { "epoch": 0.6813919730079642, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.4467, "step": 2171 }, { "epoch": 0.6817058338891286, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5383, "step": 2172 }, { "epoch": 0.6820196947702931, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.6602, "step": 2173 }, { "epoch": 0.6823335556514575, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.5281, "step": 2174 }, { "epoch": 0.682647416532622, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.5583, "step": 2175 }, { "epoch": 0.6829612774137863, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.7739, "step": 2176 }, { "epoch": 0.6832751382949508, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.5948, "step": 2177 }, { "epoch": 0.6835889991761152, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.7775, "step": 2178 }, { "epoch": 0.6839028600572796, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.4536, "step": 2179 }, { "epoch": 0.684216720938444, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.951, "step": 2180 }, { "epoch": 0.6845305818196085, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.9433, "step": 2181 }, { "epoch": 0.6848444427007729, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 2.1355, "step": 2182 }, { "epoch": 0.6851583035819373, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.168, "step": 2183 }, { "epoch": 0.6854721644631018, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.1087, "step": 2184 }, { "epoch": 0.6857860253442661, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.5491, "step": 2185 }, { "epoch": 0.6860998862254306, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 2.0621, "step": 2186 }, { "epoch": 0.686413747106595, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.376, "step": 2187 }, { "epoch": 0.6867276079877594, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.5973, "step": 2188 }, { "epoch": 0.6870414688689238, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.8221, "step": 2189 }, { "epoch": 0.6873553297500883, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.0147, "step": 2190 }, { "epoch": 0.6876691906312526, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 2.1272, "step": 2191 }, { "epoch": 0.6879830515124171, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.0319, "step": 2192 }, { "epoch": 0.6882969123935816, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.3523, "step": 2193 }, { "epoch": 0.688610773274746, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.8954, "step": 2194 }, { "epoch": 0.6889246341559104, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.0146, "step": 2195 }, { "epoch": 0.6892384950370748, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.8074, "step": 2196 }, { "epoch": 0.6895523559182393, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.0586, "step": 2197 }, { "epoch": 0.6898662167994036, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.9138, "step": 2198 }, { "epoch": 0.6901800776805681, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 2.2715, "step": 2199 }, { "epoch": 0.6904939385617325, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 3.2919, "step": 2200 }, { "epoch": 0.6908077994428969, "grad_norm": 0.08984375, "learning_rate": 0.0002, "loss": 1.353, "step": 2201 }, { "epoch": 0.6911216603240613, "grad_norm": 0.0966796875, "learning_rate": 0.0002, "loss": 1.2476, "step": 2202 }, { "epoch": 0.6914355212052258, "grad_norm": 0.10009765625, "learning_rate": 0.0002, "loss": 1.2903, "step": 2203 }, { "epoch": 0.6917493820863903, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.4513, "step": 2204 }, { "epoch": 0.6920632429675546, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.3309, "step": 2205 }, { "epoch": 0.6923771038487191, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.5383, "step": 2206 }, { "epoch": 0.6926909647298835, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3742, "step": 2207 }, { "epoch": 0.6930048256110479, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4576, "step": 2208 }, { "epoch": 0.6933186864922123, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3171, "step": 2209 }, { "epoch": 0.6936325473733768, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3857, "step": 2210 }, { "epoch": 0.6939464082545411, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.4446, "step": 2211 }, { "epoch": 0.6942602691357056, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.471, "step": 2212 }, { "epoch": 0.6945741300168701, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3299, "step": 2213 }, { "epoch": 0.6948879908980344, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.4443, "step": 2214 }, { "epoch": 0.6952018517791989, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.371, "step": 2215 }, { "epoch": 0.6955157126603633, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3477, "step": 2216 }, { "epoch": 0.6958295735415277, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3967, "step": 2217 }, { "epoch": 0.6961434344226921, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5423, "step": 2218 }, { "epoch": 0.6964572953038566, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.5446, "step": 2219 }, { "epoch": 0.696771156185021, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.5536, "step": 2220 }, { "epoch": 0.6970850170661854, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3579, "step": 2221 }, { "epoch": 0.6973988779473498, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.2884, "step": 2222 }, { "epoch": 0.6977127388285143, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.9048, "step": 2223 }, { "epoch": 0.6980265997096787, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.5609, "step": 2224 }, { "epoch": 0.6983404605908431, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.4689, "step": 2225 }, { "epoch": 0.6986543214720076, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.5317, "step": 2226 }, { "epoch": 0.6989681823531719, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.7305, "step": 2227 }, { "epoch": 0.6992820432343364, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.7872, "step": 2228 }, { "epoch": 0.6995959041155008, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4927, "step": 2229 }, { "epoch": 0.6999097649966652, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 2.1224, "step": 2230 }, { "epoch": 0.7002236258778296, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.929, "step": 2231 }, { "epoch": 0.7005374867589941, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.4381, "step": 2232 }, { "epoch": 0.7008513476401586, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.2564, "step": 2233 }, { "epoch": 0.7011652085213229, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.7457, "step": 2234 }, { "epoch": 0.7014790694024874, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.2323, "step": 2235 }, { "epoch": 0.7017929302836518, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.4091, "step": 2236 }, { "epoch": 0.7021067911648162, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.3004, "step": 2237 }, { "epoch": 0.7024206520459806, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.1265, "step": 2238 }, { "epoch": 0.7027345129271451, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.4745, "step": 2239 }, { "epoch": 0.7030483738083094, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.9056, "step": 2240 }, { "epoch": 0.7033622346894739, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.9232, "step": 2241 }, { "epoch": 0.7036760955706383, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.3736, "step": 2242 }, { "epoch": 0.7039899564518027, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.5, "step": 2243 }, { "epoch": 0.7043038173329672, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.1852, "step": 2244 }, { "epoch": 0.7043038173329672, "eval_loss": 1.7769441604614258, "eval_runtime": 123.5225, "eval_samples_per_second": 8.096, "eval_steps_per_second": 8.096, "step": 2244 }, { "epoch": 0.7043038173329672, "mmlu_eval_accuracy": 0.4180599978588911, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.7391304347826086, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.3235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.41935483870967744, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.1384575194116513, "step": 2244 }, { "epoch": 0.7046176782141316, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.0939, "step": 2245 }, { "epoch": 0.704931539095296, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.8281, "step": 2246 }, { "epoch": 0.7052453999764604, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.0069, "step": 2247 }, { "epoch": 0.7055592608576249, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 2.0889, "step": 2248 }, { "epoch": 0.7058731217387892, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 2.6987, "step": 2249 }, { "epoch": 0.7061869826199537, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 2.7403, "step": 2250 }, { "epoch": 0.7065008435011181, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.457, "step": 2251 }, { "epoch": 0.7068147043822826, "grad_norm": 0.1005859375, "learning_rate": 0.0002, "loss": 1.3027, "step": 2252 }, { "epoch": 0.707128565263447, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.4681, "step": 2253 }, { "epoch": 0.7074424261446114, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.3744, "step": 2254 }, { "epoch": 0.7077562870257759, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.6237, "step": 2255 }, { "epoch": 0.7080701479069402, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.4545, "step": 2256 }, { "epoch": 0.7083840087881047, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4803, "step": 2257 }, { "epoch": 0.7086978696692691, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.4537, "step": 2258 }, { "epoch": 0.7090117305504335, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.708, "step": 2259 }, { "epoch": 0.7093255914315979, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.5124, "step": 2260 }, { "epoch": 0.7096394523127624, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4564, "step": 2261 }, { "epoch": 0.7099533131939267, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3397, "step": 2262 }, { "epoch": 0.7102671740750912, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.3918, "step": 2263 }, { "epoch": 0.7105810349562557, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.2112, "step": 2264 }, { "epoch": 0.71089489583742, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.2941, "step": 2265 }, { "epoch": 0.7112087567185845, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5815, "step": 2266 }, { "epoch": 0.7115226175997489, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.5319, "step": 2267 }, { "epoch": 0.7118364784809134, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.4616, "step": 2268 }, { "epoch": 0.7121503393620777, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.6004, "step": 2269 }, { "epoch": 0.7124642002432422, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3365, "step": 2270 }, { "epoch": 0.7127780611244066, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.4398, "step": 2271 }, { "epoch": 0.713091922005571, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.4959, "step": 2272 }, { "epoch": 0.7134057828867355, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.8303, "step": 2273 }, { "epoch": 0.7137196437678999, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.8074, "step": 2274 }, { "epoch": 0.7140335046490643, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.5667, "step": 2275 }, { "epoch": 0.7143473655302287, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.8409, "step": 2276 }, { "epoch": 0.7146612264113932, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.9792, "step": 2277 }, { "epoch": 0.7149750872925575, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.9356, "step": 2278 }, { "epoch": 0.715288948173722, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.7637, "step": 2279 }, { "epoch": 0.7156028090548864, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.5164, "step": 2280 }, { "epoch": 0.7159166699360509, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.7661, "step": 2281 }, { "epoch": 0.7162305308172152, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.1665, "step": 2282 }, { "epoch": 0.7165443916983797, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.0457, "step": 2283 }, { "epoch": 0.7168582525795442, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.7448, "step": 2284 }, { "epoch": 0.7171721134607085, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.9213, "step": 2285 }, { "epoch": 0.717485974341873, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.2519, "step": 2286 }, { "epoch": 0.7177998352230374, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.6954, "step": 2287 }, { "epoch": 0.7181136961042018, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.1847, "step": 2288 }, { "epoch": 0.7184275569853662, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.3177, "step": 2289 }, { "epoch": 0.7187414178665307, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.7715, "step": 2290 }, { "epoch": 0.719055278747695, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1505, "step": 2291 }, { "epoch": 0.7193691396288595, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.0164, "step": 2292 }, { "epoch": 0.719683000510024, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.3602, "step": 2293 }, { "epoch": 0.7199968613911883, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.0161, "step": 2294 }, { "epoch": 0.7203107222723528, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.4021, "step": 2295 }, { "epoch": 0.7206245831535172, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.5295, "step": 2296 }, { "epoch": 0.7209384440346817, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.341, "step": 2297 }, { "epoch": 0.721252304915846, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.7983, "step": 2298 }, { "epoch": 0.7215661657970105, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.3877, "step": 2299 }, { "epoch": 0.7218800266781749, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.8016, "step": 2300 }, { "epoch": 0.7221938875593393, "grad_norm": 0.0791015625, "learning_rate": 0.0002, "loss": 1.2162, "step": 2301 }, { "epoch": 0.7225077484405037, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.3024, "step": 2302 }, { "epoch": 0.7228216093216682, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.4018, "step": 2303 }, { "epoch": 0.7231354702028326, "grad_norm": 0.1005859375, "learning_rate": 0.0002, "loss": 1.3134, "step": 2304 }, { "epoch": 0.723449331083997, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.431, "step": 2305 }, { "epoch": 0.7237631919651615, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3435, "step": 2306 }, { "epoch": 0.7240770528463258, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.5706, "step": 2307 }, { "epoch": 0.7243909137274903, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4139, "step": 2308 }, { "epoch": 0.7247047746086547, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.3822, "step": 2309 }, { "epoch": 0.7250186354898192, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.5015, "step": 2310 }, { "epoch": 0.7253324963709835, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3385, "step": 2311 }, { "epoch": 0.725646357252148, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.4255, "step": 2312 }, { "epoch": 0.7259602181333125, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.3187, "step": 2313 }, { "epoch": 0.7262740790144768, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.2845, "step": 2314 }, { "epoch": 0.7265879398956413, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3354, "step": 2315 }, { "epoch": 0.7269018007768057, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.3538, "step": 2316 }, { "epoch": 0.7272156616579701, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.624, "step": 2317 }, { "epoch": 0.7275295225391345, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4957, "step": 2318 }, { "epoch": 0.727843383420299, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.4016, "step": 2319 }, { "epoch": 0.7281572443014633, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.5854, "step": 2320 }, { "epoch": 0.7284711051826278, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.6723, "step": 2321 }, { "epoch": 0.7287849660637922, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.6414, "step": 2322 }, { "epoch": 0.7290988269449566, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.6206, "step": 2323 }, { "epoch": 0.7294126878261211, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.8653, "step": 2324 }, { "epoch": 0.7297265487072855, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.9866, "step": 2325 }, { "epoch": 0.73004040958845, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.774, "step": 2326 }, { "epoch": 0.7303542704696143, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.601, "step": 2327 }, { "epoch": 0.7306681313507788, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.6226, "step": 2328 }, { "epoch": 0.7309819922319432, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.9242, "step": 2329 }, { "epoch": 0.7312958531131076, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.6306, "step": 2330 }, { "epoch": 0.731609713994272, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.917, "step": 2331 }, { "epoch": 0.7319235748754365, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.7724, "step": 2332 }, { "epoch": 0.7322374357566009, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.172, "step": 2333 }, { "epoch": 0.7325512966377653, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 2.1364, "step": 2334 }, { "epoch": 0.7328651575189298, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.1089, "step": 2335 }, { "epoch": 0.7331790184000941, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 2.3062, "step": 2336 }, { "epoch": 0.7334928792812586, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.3192, "step": 2337 }, { "epoch": 0.733806740162423, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.1375, "step": 2338 }, { "epoch": 0.7341206010435875, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.6586, "step": 2339 }, { "epoch": 0.7344344619247518, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.358, "step": 2340 }, { "epoch": 0.7347483228059163, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.8156, "step": 2341 }, { "epoch": 0.7350621836870807, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.6493, "step": 2342 }, { "epoch": 0.7353760445682451, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.01, "step": 2343 }, { "epoch": 0.7356899054494096, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.1826, "step": 2344 }, { "epoch": 0.736003766330574, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.1244, "step": 2345 }, { "epoch": 0.7363176272117384, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.8929, "step": 2346 }, { "epoch": 0.7366314880929028, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.9275, "step": 2347 }, { "epoch": 0.7369453489740673, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.1545, "step": 2348 }, { "epoch": 0.7372592098552316, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.342, "step": 2349 }, { "epoch": 0.7375730707363961, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 2.5309, "step": 2350 }, { "epoch": 0.7378869316175605, "grad_norm": 0.09619140625, "learning_rate": 0.0002, "loss": 1.2762, "step": 2351 }, { "epoch": 0.738200792498725, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3485, "step": 2352 }, { "epoch": 0.7385146533798894, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3308, "step": 2353 }, { "epoch": 0.7388285142610538, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4661, "step": 2354 }, { "epoch": 0.7391423751422183, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5322, "step": 2355 }, { "epoch": 0.7394562360233826, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.5151, "step": 2356 }, { "epoch": 0.7397700969045471, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.567, "step": 2357 }, { "epoch": 0.7400839577857115, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.5931, "step": 2358 }, { "epoch": 0.7403978186668759, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4611, "step": 2359 }, { "epoch": 0.7407116795480403, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3765, "step": 2360 }, { "epoch": 0.7410255404292048, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3375, "step": 2361 }, { "epoch": 0.7413394013103691, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4999, "step": 2362 }, { "epoch": 0.7416532621915336, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4183, "step": 2363 }, { "epoch": 0.7419671230726981, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3219, "step": 2364 }, { "epoch": 0.7422809839538624, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.2917, "step": 2365 }, { "epoch": 0.7425948448350269, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.4742, "step": 2366 }, { "epoch": 0.7429087057161913, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.6195, "step": 2367 }, { "epoch": 0.7432225665973557, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3573, "step": 2368 }, { "epoch": 0.7435364274785201, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3846, "step": 2369 }, { "epoch": 0.7438502883596846, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.4372, "step": 2370 }, { "epoch": 0.744164149240849, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.4481, "step": 2371 }, { "epoch": 0.7444780101220134, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.5518, "step": 2372 }, { "epoch": 0.7447918710031779, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.2905, "step": 2373 }, { "epoch": 0.7451057318843423, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.5086, "step": 2374 }, { "epoch": 0.7454195927655067, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.8025, "step": 2375 }, { "epoch": 0.7457334536466711, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.5683, "step": 2376 }, { "epoch": 0.7460473145278356, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.157, "step": 2377 }, { "epoch": 0.7463611754089999, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.8266, "step": 2378 }, { "epoch": 0.7466750362901644, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.8815, "step": 2379 }, { "epoch": 0.7469888971713288, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.8668, "step": 2380 }, { "epoch": 0.7473027580524932, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.9803, "step": 2381 }, { "epoch": 0.7476166189336576, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.9043, "step": 2382 }, { "epoch": 0.7479304798148221, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.9333, "step": 2383 }, { "epoch": 0.7482443406959866, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.9732, "step": 2384 }, { "epoch": 0.7485582015771509, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.9008, "step": 2385 }, { "epoch": 0.7488720624583154, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.2057, "step": 2386 }, { "epoch": 0.7491859233394798, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 2.0298, "step": 2387 }, { "epoch": 0.7494997842206442, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.1335, "step": 2388 }, { "epoch": 0.7498136451018086, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.3354, "step": 2389 }, { "epoch": 0.7501275059829731, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.044, "step": 2390 }, { "epoch": 0.7504413668641374, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 2.3769, "step": 2391 }, { "epoch": 0.7507552277453019, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.2939, "step": 2392 }, { "epoch": 0.7510690886264663, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.0112, "step": 2393 }, { "epoch": 0.7513829495076307, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8882, "step": 2394 }, { "epoch": 0.7516968103887952, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.8986, "step": 2395 }, { "epoch": 0.7520106712699596, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.9775, "step": 2396 }, { "epoch": 0.752324532151124, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.8405, "step": 2397 }, { "epoch": 0.7526383930322884, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.4113, "step": 2398 }, { "epoch": 0.7529522539134529, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.4571, "step": 2399 }, { "epoch": 0.7532661147946172, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 3.0139, "step": 2400 }, { "epoch": 0.7535799756757817, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.2474, "step": 2401 }, { "epoch": 0.7538938365569461, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5637, "step": 2402 }, { "epoch": 0.7542076974381106, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4162, "step": 2403 }, { "epoch": 0.754521558319275, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4733, "step": 2404 }, { "epoch": 0.7548354192004394, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4128, "step": 2405 }, { "epoch": 0.7551492800816039, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.2692, "step": 2406 }, { "epoch": 0.7554631409627682, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.3874, "step": 2407 }, { "epoch": 0.7557770018439327, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.4692, "step": 2408 }, { "epoch": 0.7560908627250971, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.528, "step": 2409 }, { "epoch": 0.7564047236062615, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5061, "step": 2410 }, { "epoch": 0.7567185844874259, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.5193, "step": 2411 }, { "epoch": 0.7570324453685904, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.5211, "step": 2412 }, { "epoch": 0.7573463062497547, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4203, "step": 2413 }, { "epoch": 0.7576601671309192, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.6103, "step": 2414 }, { "epoch": 0.7579740280120837, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.5367, "step": 2415 }, { "epoch": 0.758287888893248, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3625, "step": 2416 }, { "epoch": 0.7586017497744125, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4703, "step": 2417 }, { "epoch": 0.7589156106555769, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.362, "step": 2418 }, { "epoch": 0.7592294715367414, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5297, "step": 2419 }, { "epoch": 0.7595433324179057, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.6779, "step": 2420 }, { "epoch": 0.7598571932990702, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5044, "step": 2421 }, { "epoch": 0.7601710541802346, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.7196, "step": 2422 }, { "epoch": 0.760484915061399, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.6114, "step": 2423 }, { "epoch": 0.7607987759425635, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.4659, "step": 2424 }, { "epoch": 0.7611126368237279, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.7074, "step": 2425 }, { "epoch": 0.7614264977048923, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.4459, "step": 2426 }, { "epoch": 0.7617403585860567, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.8313, "step": 2427 }, { "epoch": 0.7620542194672212, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.8035, "step": 2428 }, { "epoch": 0.7623680803483855, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.2824, "step": 2429 }, { "epoch": 0.76268194122955, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.9295, "step": 2430 }, { "epoch": 0.7629958021107144, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.6543, "step": 2431 }, { "epoch": 0.7629958021107144, "eval_loss": 1.7652301788330078, "eval_runtime": 122.9161, "eval_samples_per_second": 8.136, "eval_steps_per_second": 8.136, "step": 2431 }, { "epoch": 0.7629958021107144, "mmlu_eval_accuracy": 0.42014877718164845, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.6190476190476191, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.5833333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.5833333333333334, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.88, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.26, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.2213488024173416, "step": 2431 }, { "epoch": 0.7633096629918789, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.2962, "step": 2432 }, { "epoch": 0.7636235238730432, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.0499, "step": 2433 }, { "epoch": 0.7639373847542077, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.2559, "step": 2434 }, { "epoch": 0.7642512456353722, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.0198, "step": 2435 }, { "epoch": 0.7645651065165365, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.8019, "step": 2436 }, { "epoch": 0.764878967397701, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.8726, "step": 2437 }, { "epoch": 0.7651928282788654, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 2.0747, "step": 2438 }, { "epoch": 0.7655066891600298, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.5202, "step": 2439 }, { "epoch": 0.7658205500411942, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.3145, "step": 2440 }, { "epoch": 0.7661344109223587, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.0447, "step": 2441 }, { "epoch": 0.766448271803523, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 2.6057, "step": 2442 }, { "epoch": 0.7667621326846875, "grad_norm": 1.4765625, "learning_rate": 0.0002, "loss": 2.1988, "step": 2443 }, { "epoch": 0.767075993565852, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.3199, "step": 2444 }, { "epoch": 0.7673898544470164, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.1278, "step": 2445 }, { "epoch": 0.7677037153281808, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.4532, "step": 2446 }, { "epoch": 0.7680175762093452, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.0989, "step": 2447 }, { "epoch": 0.7683314370905097, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.963, "step": 2448 }, { "epoch": 0.768645297971674, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.337, "step": 2449 }, { "epoch": 0.7689591588528385, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.7283, "step": 2450 }, { "epoch": 0.7692730197340029, "grad_norm": 0.1142578125, "learning_rate": 0.0002, "loss": 1.3662, "step": 2451 }, { "epoch": 0.7695868806151673, "grad_norm": 0.10107421875, "learning_rate": 0.0002, "loss": 1.3258, "step": 2452 }, { "epoch": 0.7699007414963317, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3579, "step": 2453 }, { "epoch": 0.7702146023774962, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.5144, "step": 2454 }, { "epoch": 0.7705284632586606, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.2933, "step": 2455 }, { "epoch": 0.770842324139825, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4465, "step": 2456 }, { "epoch": 0.7711561850209895, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.2679, "step": 2457 }, { "epoch": 0.7714700459021538, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.2474, "step": 2458 }, { "epoch": 0.7717839067833183, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.358, "step": 2459 }, { "epoch": 0.7720977676644827, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.2669, "step": 2460 }, { "epoch": 0.7724116285456472, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.5163, "step": 2461 }, { "epoch": 0.7727254894268115, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.2793, "step": 2462 }, { "epoch": 0.773039350307976, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6406, "step": 2463 }, { "epoch": 0.7733532111891405, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.357, "step": 2464 }, { "epoch": 0.7736670720703048, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5362, "step": 2465 }, { "epoch": 0.7739809329514693, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.4644, "step": 2466 }, { "epoch": 0.7742947938326337, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4558, "step": 2467 }, { "epoch": 0.7746086547137981, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.3939, "step": 2468 }, { "epoch": 0.7749225155949625, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.576, "step": 2469 }, { "epoch": 0.775236376476127, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.6347, "step": 2470 }, { "epoch": 0.7755502373572913, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3226, "step": 2471 }, { "epoch": 0.7758640982384558, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.3418, "step": 2472 }, { "epoch": 0.7761779591196202, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.723, "step": 2473 }, { "epoch": 0.7764918200007846, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.5778, "step": 2474 }, { "epoch": 0.7768056808819491, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.7445, "step": 2475 }, { "epoch": 0.7771195417631135, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.6642, "step": 2476 }, { "epoch": 0.777433402644278, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.7149, "step": 2477 }, { "epoch": 0.7777472635254423, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.974, "step": 2478 }, { "epoch": 0.7780611244066068, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.8387, "step": 2479 }, { "epoch": 0.7783749852877712, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.1841, "step": 2480 }, { "epoch": 0.7786888461689356, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.6258, "step": 2481 }, { "epoch": 0.7790027070501, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.3414, "step": 2482 }, { "epoch": 0.7793165679312645, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.6989, "step": 2483 }, { "epoch": 0.7796304288124289, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.2343, "step": 2484 }, { "epoch": 0.7799442896935933, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.5767, "step": 2485 }, { "epoch": 0.7802581505747578, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1286, "step": 2486 }, { "epoch": 0.7805720114559221, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.1996, "step": 2487 }, { "epoch": 0.7808858723370866, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.9517, "step": 2488 }, { "epoch": 0.781199733218251, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.0916, "step": 2489 }, { "epoch": 0.7815135940994155, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.1751, "step": 2490 }, { "epoch": 0.7818274549805798, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.9032, "step": 2491 }, { "epoch": 0.7821413158617443, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.132, "step": 2492 }, { "epoch": 0.7824551767429087, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.3776, "step": 2493 }, { "epoch": 0.7827690376240731, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.0207, "step": 2494 }, { "epoch": 0.7830828985052376, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.0179, "step": 2495 }, { "epoch": 0.783396759386402, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.2601, "step": 2496 }, { "epoch": 0.7837106202675664, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.2889, "step": 2497 }, { "epoch": 0.7840244811487308, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.9265, "step": 2498 }, { "epoch": 0.7843383420298953, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.3361, "step": 2499 }, { "epoch": 0.7846522029110596, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 2.7706, "step": 2500 }, { "epoch": 0.7849660637922241, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.386, "step": 2501 }, { "epoch": 0.7852799246733885, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.3792, "step": 2502 }, { "epoch": 0.785593785554553, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.3294, "step": 2503 }, { "epoch": 0.7859076464357174, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4478, "step": 2504 }, { "epoch": 0.7862215073168818, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5299, "step": 2505 }, { "epoch": 0.7865353681980463, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.4589, "step": 2506 }, { "epoch": 0.7868492290792106, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5182, "step": 2507 }, { "epoch": 0.7871630899603751, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.4176, "step": 2508 }, { "epoch": 0.7874769508415395, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.4129, "step": 2509 }, { "epoch": 0.7877908117227039, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5108, "step": 2510 }, { "epoch": 0.7881046726038683, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.4665, "step": 2511 }, { "epoch": 0.7884185334850328, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.5009, "step": 2512 }, { "epoch": 0.7887323943661971, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4032, "step": 2513 }, { "epoch": 0.7890462552473616, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.3496, "step": 2514 }, { "epoch": 0.7893601161285261, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4753, "step": 2515 }, { "epoch": 0.7896739770096904, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5218, "step": 2516 }, { "epoch": 0.7899878378908549, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4646, "step": 2517 }, { "epoch": 0.7903016987720193, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3674, "step": 2518 }, { "epoch": 0.7906155596531838, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.4775, "step": 2519 }, { "epoch": 0.7909294205343481, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3576, "step": 2520 }, { "epoch": 0.7912432814155126, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.457, "step": 2521 }, { "epoch": 0.791557142296677, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.973, "step": 2522 }, { "epoch": 0.7918710031778414, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.5673, "step": 2523 }, { "epoch": 0.7921848640590059, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.8701, "step": 2524 }, { "epoch": 0.7924987249401703, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.5442, "step": 2525 }, { "epoch": 0.7928125858213347, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.6074, "step": 2526 }, { "epoch": 0.7931264467024991, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.6917, "step": 2527 }, { "epoch": 0.7934403075836636, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.9244, "step": 2528 }, { "epoch": 0.7937541684648279, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.7359, "step": 2529 }, { "epoch": 0.7940680293459924, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.3441, "step": 2530 }, { "epoch": 0.7943818902271568, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 2.0712, "step": 2531 }, { "epoch": 0.7946957511083212, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.2452, "step": 2532 }, { "epoch": 0.7950096119894856, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.1489, "step": 2533 }, { "epoch": 0.7953234728706501, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.1836, "step": 2534 }, { "epoch": 0.7956373337518146, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.7811, "step": 2535 }, { "epoch": 0.7959511946329789, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.8184, "step": 2536 }, { "epoch": 0.7962650555141434, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.8137, "step": 2537 }, { "epoch": 0.7965789163953078, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.1316, "step": 2538 }, { "epoch": 0.7968927772764722, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.976, "step": 2539 }, { "epoch": 0.7972066381576366, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.4813, "step": 2540 }, { "epoch": 0.7975204990388011, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.05, "step": 2541 }, { "epoch": 0.7978343599199654, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.0406, "step": 2542 }, { "epoch": 0.7981482208011299, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.1095, "step": 2543 }, { "epoch": 0.7984620816822944, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.3052, "step": 2544 }, { "epoch": 0.7987759425634587, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.2199, "step": 2545 }, { "epoch": 0.7990898034446232, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 2.0261, "step": 2546 }, { "epoch": 0.7994036643257876, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.8794, "step": 2547 }, { "epoch": 0.799717525206952, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.9501, "step": 2548 }, { "epoch": 0.8000313860881164, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.1907, "step": 2549 }, { "epoch": 0.8003452469692809, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 3.1421, "step": 2550 }, { "epoch": 0.8006591078504453, "grad_norm": 0.099609375, "learning_rate": 0.0002, "loss": 1.3507, "step": 2551 }, { "epoch": 0.8009729687316097, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.426, "step": 2552 }, { "epoch": 0.8012868296127741, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.312, "step": 2553 }, { "epoch": 0.8016006904939386, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5689, "step": 2554 }, { "epoch": 0.801914551375103, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3539, "step": 2555 }, { "epoch": 0.8022284122562674, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.282, "step": 2556 }, { "epoch": 0.8025422731374319, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3984, "step": 2557 }, { "epoch": 0.8028561340185962, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6132, "step": 2558 }, { "epoch": 0.8031699948997607, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.3977, "step": 2559 }, { "epoch": 0.8034838557809251, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4431, "step": 2560 }, { "epoch": 0.8037977166620895, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.4175, "step": 2561 }, { "epoch": 0.8041115775432539, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.5591, "step": 2562 }, { "epoch": 0.8044254384244184, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4017, "step": 2563 }, { "epoch": 0.8047392993055829, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5094, "step": 2564 }, { "epoch": 0.8050531601867472, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.4971, "step": 2565 }, { "epoch": 0.8053670210679117, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4216, "step": 2566 }, { "epoch": 0.805680881949076, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4232, "step": 2567 }, { "epoch": 0.8059947428302405, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.4799, "step": 2568 }, { "epoch": 0.8063086037114049, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3335, "step": 2569 }, { "epoch": 0.8066224645925694, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.6929, "step": 2570 }, { "epoch": 0.8069363254737337, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.4864, "step": 2571 }, { "epoch": 0.8072501863548982, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.6533, "step": 2572 }, { "epoch": 0.8075640472360626, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.6576, "step": 2573 }, { "epoch": 0.807877908117227, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.8208, "step": 2574 }, { "epoch": 0.8081917689983915, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.7821, "step": 2575 }, { "epoch": 0.8085056298795559, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.7887, "step": 2576 }, { "epoch": 0.8088194907607203, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.761, "step": 2577 }, { "epoch": 0.8091333516418847, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.7492, "step": 2578 }, { "epoch": 0.8094472125230492, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.9048, "step": 2579 }, { "epoch": 0.8097610734042135, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.9207, "step": 2580 }, { "epoch": 0.810074934285378, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.1126, "step": 2581 }, { "epoch": 0.8103887951665424, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.9245, "step": 2582 }, { "epoch": 0.8107026560477069, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1493, "step": 2583 }, { "epoch": 0.8110165169288712, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.2426, "step": 2584 }, { "epoch": 0.8113303778100357, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.4175, "step": 2585 }, { "epoch": 0.8116442386912002, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.9238, "step": 2586 }, { "epoch": 0.8119580995723645, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.9419, "step": 2587 }, { "epoch": 0.812271960453529, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.2005, "step": 2588 }, { "epoch": 0.8125858213346934, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.8211, "step": 2589 }, { "epoch": 0.8128996822158578, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.8112, "step": 2590 }, { "epoch": 0.8132135430970222, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.8194, "step": 2591 }, { "epoch": 0.8135274039781867, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9225, "step": 2592 }, { "epoch": 0.813841264859351, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.1797, "step": 2593 }, { "epoch": 0.8141551257405155, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.9405, "step": 2594 }, { "epoch": 0.81446898662168, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.6221, "step": 2595 }, { "epoch": 0.8147828475028444, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.7334, "step": 2596 }, { "epoch": 0.8150967083840088, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.3404, "step": 2597 }, { "epoch": 0.8154105692651732, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.3539, "step": 2598 }, { "epoch": 0.8157244301463377, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 3.0796, "step": 2599 }, { "epoch": 0.816038291027502, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.986, "step": 2600 }, { "epoch": 0.8163521519086665, "grad_norm": 0.0810546875, "learning_rate": 0.0002, "loss": 1.1332, "step": 2601 }, { "epoch": 0.8166660127898309, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.4435, "step": 2602 }, { "epoch": 0.8169798736709953, "grad_norm": 0.111328125, "learning_rate": 0.0002, "loss": 1.3176, "step": 2603 }, { "epoch": 0.8172937345521597, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.4673, "step": 2604 }, { "epoch": 0.8176075954333242, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.3753, "step": 2605 }, { "epoch": 0.8179214563144886, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.7531, "step": 2606 }, { "epoch": 0.818235317195653, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5415, "step": 2607 }, { "epoch": 0.8185491780768175, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.3644, "step": 2608 }, { "epoch": 0.8188630389579818, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.5165, "step": 2609 }, { "epoch": 0.8191768998391463, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.3583, "step": 2610 }, { "epoch": 0.8194907607203107, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.2584, "step": 2611 }, { "epoch": 0.8198046216014752, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.2903, "step": 2612 }, { "epoch": 0.8201184824826395, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3581, "step": 2613 }, { "epoch": 0.820432343363804, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.2519, "step": 2614 }, { "epoch": 0.8207462042449685, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.2875, "step": 2615 }, { "epoch": 0.8210600651261328, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4079, "step": 2616 }, { "epoch": 0.8213739260072973, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6645, "step": 2617 }, { "epoch": 0.8216877868884617, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5979, "step": 2618 }, { "epoch": 0.8216877868884617, "eval_loss": 1.7699179649353027, "eval_runtime": 123.3214, "eval_samples_per_second": 8.109, "eval_steps_per_second": 8.109, "step": 2618 }, { "epoch": 0.8216877868884617, "mmlu_eval_accuracy": 0.4268902783865228, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3953488372093023, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5769230769230769, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5697674418604651, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.25, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.41935483870967744, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2396918582534728, "step": 2618 }, { "epoch": 0.8220016477696261, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.4718, "step": 2619 }, { "epoch": 0.8223155086507905, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.6423, "step": 2620 }, { "epoch": 0.822629369531955, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4218, "step": 2621 }, { "epoch": 0.8229432304131193, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.5827, "step": 2622 }, { "epoch": 0.8232570912942838, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 2.0448, "step": 2623 }, { "epoch": 0.8235709521754482, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4869, "step": 2624 }, { "epoch": 0.8238848130566127, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.5805, "step": 2625 }, { "epoch": 0.8241986739377771, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.8846, "step": 2626 }, { "epoch": 0.8245125348189415, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.9054, "step": 2627 }, { "epoch": 0.824826395700106, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1049, "step": 2628 }, { "epoch": 0.8251402565812703, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.9548, "step": 2629 }, { "epoch": 0.8254541174624348, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 2.3645, "step": 2630 }, { "epoch": 0.8257679783435992, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 2.0293, "step": 2631 }, { "epoch": 0.8260818392247636, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.3448, "step": 2632 }, { "epoch": 0.826395700105928, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.1027, "step": 2633 }, { "epoch": 0.8267095609870925, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.8683, "step": 2634 }, { "epoch": 0.827023421868257, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.9694, "step": 2635 }, { "epoch": 0.8273372827494213, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.9533, "step": 2636 }, { "epoch": 0.8276511436305858, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.2496, "step": 2637 }, { "epoch": 0.8279650045117501, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.3666, "step": 2638 }, { "epoch": 0.8282788653929146, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.9966, "step": 2639 }, { "epoch": 0.828592726274079, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.1991, "step": 2640 }, { "epoch": 0.8289065871552435, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.2321, "step": 2641 }, { "epoch": 0.8292204480364078, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.1258, "step": 2642 }, { "epoch": 0.8295343089175723, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.6097, "step": 2643 }, { "epoch": 0.8298481697987367, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.277, "step": 2644 }, { "epoch": 0.8301620306799011, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.0178, "step": 2645 }, { "epoch": 0.8304758915610656, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.0701, "step": 2646 }, { "epoch": 0.83078975244223, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6007, "step": 2647 }, { "epoch": 0.8311036133233944, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.1087, "step": 2648 }, { "epoch": 0.8314174742045588, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.9444, "step": 2649 }, { "epoch": 0.8317313350857233, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.9424, "step": 2650 }, { "epoch": 0.8320451959668876, "grad_norm": 0.091796875, "learning_rate": 0.0002, "loss": 1.3177, "step": 2651 }, { "epoch": 0.8323590568480521, "grad_norm": 0.10009765625, "learning_rate": 0.0002, "loss": 1.2663, "step": 2652 }, { "epoch": 0.8326729177292165, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.423, "step": 2653 }, { "epoch": 0.832986778610381, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.3817, "step": 2654 }, { "epoch": 0.8333006394915454, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.3837, "step": 2655 }, { "epoch": 0.8336145003727098, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.471, "step": 2656 }, { "epoch": 0.8339283612538743, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.3721, "step": 2657 }, { "epoch": 0.8342422221350386, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.5549, "step": 2658 }, { "epoch": 0.8345560830162031, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.398, "step": 2659 }, { "epoch": 0.8348699438973675, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.2919, "step": 2660 }, { "epoch": 0.8351838047785319, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5027, "step": 2661 }, { "epoch": 0.8354976656596963, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.4469, "step": 2662 }, { "epoch": 0.8358115265408608, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.4467, "step": 2663 }, { "epoch": 0.8361253874220251, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5144, "step": 2664 }, { "epoch": 0.8364392483031896, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.5085, "step": 2665 }, { "epoch": 0.8367531091843541, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3534, "step": 2666 }, { "epoch": 0.8370669700655184, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.3309, "step": 2667 }, { "epoch": 0.8373808309466829, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3582, "step": 2668 }, { "epoch": 0.8376946918278473, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4316, "step": 2669 }, { "epoch": 0.8380085527090118, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5175, "step": 2670 }, { "epoch": 0.8383224135901761, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.5308, "step": 2671 }, { "epoch": 0.8386362744713406, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.677, "step": 2672 }, { "epoch": 0.838950135352505, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.466, "step": 2673 }, { "epoch": 0.8392639962336694, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.4851, "step": 2674 }, { "epoch": 0.8395778571148339, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.6542, "step": 2675 }, { "epoch": 0.8398917179959983, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.6008, "step": 2676 }, { "epoch": 0.8402055788771627, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.5995, "step": 2677 }, { "epoch": 0.8405194397583271, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.6109, "step": 2678 }, { "epoch": 0.8408333006394916, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.9894, "step": 2679 }, { "epoch": 0.8411471615206559, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.9258, "step": 2680 }, { "epoch": 0.8414610224018204, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.4025, "step": 2681 }, { "epoch": 0.8417748832829848, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.4065, "step": 2682 }, { "epoch": 0.8420887441641492, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.0412, "step": 2683 }, { "epoch": 0.8424026050453136, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.6364, "step": 2684 }, { "epoch": 0.8427164659264781, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.8805, "step": 2685 }, { "epoch": 0.8430303268076426, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.0839, "step": 2686 }, { "epoch": 0.8433441876888069, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.1611, "step": 2687 }, { "epoch": 0.8436580485699714, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.2773, "step": 2688 }, { "epoch": 0.8439719094511358, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.18, "step": 2689 }, { "epoch": 0.8442857703323002, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 2.009, "step": 2690 }, { "epoch": 0.8445996312134646, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.1408, "step": 2691 }, { "epoch": 0.8449134920946291, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.4323, "step": 2692 }, { "epoch": 0.8452273529757934, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.6591, "step": 2693 }, { "epoch": 0.8455412138569579, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.1148, "step": 2694 }, { "epoch": 0.8458550747381224, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 2.1269, "step": 2695 }, { "epoch": 0.8461689356192867, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.169, "step": 2696 }, { "epoch": 0.8464827965004512, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.8229, "step": 2697 }, { "epoch": 0.8467966573816156, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.8605, "step": 2698 }, { "epoch": 0.84711051826278, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.0483, "step": 2699 }, { "epoch": 0.8474243791439444, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.4855, "step": 2700 }, { "epoch": 0.8477382400251089, "grad_norm": 0.10009765625, "learning_rate": 0.0002, "loss": 1.4351, "step": 2701 }, { "epoch": 0.8480521009062733, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.3687, "step": 2702 }, { "epoch": 0.8483659617874377, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.3651, "step": 2703 }, { "epoch": 0.8486798226686021, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5124, "step": 2704 }, { "epoch": 0.8489936835497666, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4039, "step": 2705 }, { "epoch": 0.849307544430931, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5525, "step": 2706 }, { "epoch": 0.8496214053120954, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.2542, "step": 2707 }, { "epoch": 0.8499352661932599, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4129, "step": 2708 }, { "epoch": 0.8502491270744242, "grad_norm": 0.123046875, "learning_rate": 0.0002, "loss": 1.3412, "step": 2709 }, { "epoch": 0.8505629879555887, "grad_norm": 0.115234375, "learning_rate": 0.0002, "loss": 1.3434, "step": 2710 }, { "epoch": 0.8508768488367531, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.2987, "step": 2711 }, { "epoch": 0.8511907097179175, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4644, "step": 2712 }, { "epoch": 0.8515045705990819, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4776, "step": 2713 }, { "epoch": 0.8518184314802464, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.4467, "step": 2714 }, { "epoch": 0.8521322923614109, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5172, "step": 2715 }, { "epoch": 0.8524461532425752, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4333, "step": 2716 }, { "epoch": 0.8527600141237397, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4989, "step": 2717 }, { "epoch": 0.8530738750049041, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4445, "step": 2718 }, { "epoch": 0.8533877358860685, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.503, "step": 2719 }, { "epoch": 0.8537015967672329, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3651, "step": 2720 }, { "epoch": 0.8540154576483974, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.43, "step": 2721 }, { "epoch": 0.8543293185295617, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.2472, "step": 2722 }, { "epoch": 0.8546431794107262, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4451, "step": 2723 }, { "epoch": 0.8549570402918906, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.5654, "step": 2724 }, { "epoch": 0.855270901173055, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.8145, "step": 2725 }, { "epoch": 0.8555847620542195, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.5999, "step": 2726 }, { "epoch": 0.8558986229353839, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.6916, "step": 2727 }, { "epoch": 0.8562124838165484, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.8034, "step": 2728 }, { "epoch": 0.8565263446977127, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.5139, "step": 2729 }, { "epoch": 0.8568402055788772, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.6104, "step": 2730 }, { "epoch": 0.8571540664600416, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.9277, "step": 2731 }, { "epoch": 0.857467927341206, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.5512, "step": 2732 }, { "epoch": 0.8577817882223704, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 2.2699, "step": 2733 }, { "epoch": 0.8580956491035349, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7613, "step": 2734 }, { "epoch": 0.8584095099846993, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.4218, "step": 2735 }, { "epoch": 0.8587233708658637, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.3356, "step": 2736 }, { "epoch": 0.8590372317470282, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 2.4203, "step": 2737 }, { "epoch": 0.8593510926281925, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.055, "step": 2738 }, { "epoch": 0.859664953509357, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.8217, "step": 2739 }, { "epoch": 0.8599788143905214, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 2.0541, "step": 2740 }, { "epoch": 0.8602926752716858, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.9936, "step": 2741 }, { "epoch": 0.8606065361528502, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 2.1029, "step": 2742 }, { "epoch": 0.8609203970340147, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.2554, "step": 2743 }, { "epoch": 0.861234257915179, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.9239, "step": 2744 }, { "epoch": 0.8615481187963435, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.6727, "step": 2745 }, { "epoch": 0.861861979677508, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.6823, "step": 2746 }, { "epoch": 0.8621758405586724, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.1984, "step": 2747 }, { "epoch": 0.8624897014398368, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.7844, "step": 2748 }, { "epoch": 0.8628035623210012, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.5199, "step": 2749 }, { "epoch": 0.8631174232021657, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.9072, "step": 2750 }, { "epoch": 0.86343128408333, "grad_norm": 0.099609375, "learning_rate": 0.0002, "loss": 1.4048, "step": 2751 }, { "epoch": 0.8637451449644945, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.4475, "step": 2752 }, { "epoch": 0.8640590058456589, "grad_norm": 0.10791015625, "learning_rate": 0.0002, "loss": 1.3802, "step": 2753 }, { "epoch": 0.8643728667268233, "grad_norm": 0.11181640625, "learning_rate": 0.0002, "loss": 1.4085, "step": 2754 }, { "epoch": 0.8646867276079877, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.4118, "step": 2755 }, { "epoch": 0.8650005884891522, "grad_norm": 0.10693359375, "learning_rate": 0.0002, "loss": 1.3409, "step": 2756 }, { "epoch": 0.8653144493703167, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.442, "step": 2757 }, { "epoch": 0.865628310251481, "grad_norm": 0.11474609375, "learning_rate": 0.0002, "loss": 1.4154, "step": 2758 }, { "epoch": 0.8659421711326455, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.431, "step": 2759 }, { "epoch": 0.8662560320138099, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4166, "step": 2760 }, { "epoch": 0.8665698928949743, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.3694, "step": 2761 }, { "epoch": 0.8668837537761387, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3844, "step": 2762 }, { "epoch": 0.8671976146573032, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.2504, "step": 2763 }, { "epoch": 0.8675114755384675, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3266, "step": 2764 }, { "epoch": 0.867825336419632, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.5628, "step": 2765 }, { "epoch": 0.8681391973007965, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5032, "step": 2766 }, { "epoch": 0.8684530581819608, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5167, "step": 2767 }, { "epoch": 0.8687669190631253, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.6064, "step": 2768 }, { "epoch": 0.8690807799442897, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.5063, "step": 2769 }, { "epoch": 0.8693946408254541, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5111, "step": 2770 }, { "epoch": 0.8697085017066185, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.5674, "step": 2771 }, { "epoch": 0.870022362587783, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.564, "step": 2772 }, { "epoch": 0.8703362234689473, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.4562, "step": 2773 }, { "epoch": 0.8706500843501118, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.8018, "step": 2774 }, { "epoch": 0.8709639452312762, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.7119, "step": 2775 }, { "epoch": 0.8712778061124407, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.6327, "step": 2776 }, { "epoch": 0.8715916669936051, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.8718, "step": 2777 }, { "epoch": 0.8719055278747695, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.7342, "step": 2778 }, { "epoch": 0.872219388755934, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.9018, "step": 2779 }, { "epoch": 0.8725332496370983, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.6986, "step": 2780 }, { "epoch": 0.8728471105182628, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.9193, "step": 2781 }, { "epoch": 0.8731609713994272, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.5808, "step": 2782 }, { "epoch": 0.8734748322805916, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.88, "step": 2783 }, { "epoch": 0.873788693161756, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.2908, "step": 2784 }, { "epoch": 0.8741025540429205, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.2287, "step": 2785 }, { "epoch": 0.874416414924085, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.5983, "step": 2786 }, { "epoch": 0.8747302758052493, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.2024, "step": 2787 }, { "epoch": 0.8750441366864138, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.9481, "step": 2788 }, { "epoch": 0.8753579975675781, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 2.2747, "step": 2789 }, { "epoch": 0.8756718584487426, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.1885, "step": 2790 }, { "epoch": 0.875985719329907, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.2836, "step": 2791 }, { "epoch": 0.8762995802110715, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.3704, "step": 2792 }, { "epoch": 0.8766134410922358, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.0737, "step": 2793 }, { "epoch": 0.8769273019734003, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.1677, "step": 2794 }, { "epoch": 0.8772411628545647, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.699, "step": 2795 }, { "epoch": 0.8775550237357291, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.4544, "step": 2796 }, { "epoch": 0.8778688846168936, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.804, "step": 2797 }, { "epoch": 0.878182745498058, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.8516, "step": 2798 }, { "epoch": 0.8784966063792224, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.4355, "step": 2799 }, { "epoch": 0.8788104672603868, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.7897, "step": 2800 }, { "epoch": 0.8791243281415513, "grad_norm": 0.1201171875, "learning_rate": 0.0002, "loss": 1.3403, "step": 2801 }, { "epoch": 0.8794381890227156, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3172, "step": 2802 }, { "epoch": 0.8797520499038801, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.5083, "step": 2803 }, { "epoch": 0.8800659107850445, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.4518, "step": 2804 }, { "epoch": 0.880379771666209, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.4931, "step": 2805 }, { "epoch": 0.880379771666209, "eval_loss": 1.7788866758346558, "eval_runtime": 123.2568, "eval_samples_per_second": 8.113, "eval_steps_per_second": 8.113, "step": 2805 }, { "epoch": 0.880379771666209, "mmlu_eval_accuracy": 0.4050652279837273, "mmlu_eval_accuracy_abstract_algebra": 0.09090909090909091, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.36585365853658536, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.5833333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6666666666666666, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.45161290322580644, "mmlu_eval_accuracy_professional_law": 0.3352941176470588, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.0547769741829822, "step": 2805 }, { "epoch": 0.8806936325473734, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4426, "step": 2806 }, { "epoch": 0.8810074934285378, "grad_norm": 0.11669921875, "learning_rate": 0.0002, "loss": 1.4516, "step": 2807 }, { "epoch": 0.8813213543097023, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.4083, "step": 2808 }, { "epoch": 0.8816352151908666, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.2974, "step": 2809 }, { "epoch": 0.8819490760720311, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3967, "step": 2810 }, { "epoch": 0.8822629369531955, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.405, "step": 2811 }, { "epoch": 0.8825767978343599, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.3899, "step": 2812 }, { "epoch": 0.8828906587155243, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2886, "step": 2813 }, { "epoch": 0.8832045195966888, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.4096, "step": 2814 }, { "epoch": 0.8835183804778531, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.3102, "step": 2815 }, { "epoch": 0.8838322413590176, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.2996, "step": 2816 }, { "epoch": 0.8841461022401821, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.4241, "step": 2817 }, { "epoch": 0.8844599631213464, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4082, "step": 2818 }, { "epoch": 0.8847738240025109, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.3717, "step": 2819 }, { "epoch": 0.8850876848836753, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.4103, "step": 2820 }, { "epoch": 0.8854015457648398, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.5948, "step": 2821 }, { "epoch": 0.8857154066460041, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4344, "step": 2822 }, { "epoch": 0.8860292675271686, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.5446, "step": 2823 }, { "epoch": 0.886343128408333, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.6508, "step": 2824 }, { "epoch": 0.8866569892894974, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4263, "step": 2825 }, { "epoch": 0.8869708501706619, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.5295, "step": 2826 }, { "epoch": 0.8872847110518263, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.4279, "step": 2827 }, { "epoch": 0.8875985719329907, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.6395, "step": 2828 }, { "epoch": 0.8879124328141551, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.6521, "step": 2829 }, { "epoch": 0.8882262936953196, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.7628, "step": 2830 }, { "epoch": 0.8885401545764839, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.9453, "step": 2831 }, { "epoch": 0.8888540154576484, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 2.0855, "step": 2832 }, { "epoch": 0.8891678763388128, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.3188, "step": 2833 }, { "epoch": 0.8894817372199773, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.9983, "step": 2834 }, { "epoch": 0.8897955981011416, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.6919, "step": 2835 }, { "epoch": 0.8901094589823061, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 2.2802, "step": 2836 }, { "epoch": 0.8904233198634706, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.7236, "step": 2837 }, { "epoch": 0.8907371807446349, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 2.1315, "step": 2838 }, { "epoch": 0.8910510416257994, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.3496, "step": 2839 }, { "epoch": 0.8913649025069638, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.5181, "step": 2840 }, { "epoch": 0.8916787633881282, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.4695, "step": 2841 }, { "epoch": 0.8919926242692926, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 2.0334, "step": 2842 }, { "epoch": 0.8923064851504571, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.2868, "step": 2843 }, { "epoch": 0.8926203460316214, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.9286, "step": 2844 }, { "epoch": 0.8929342069127859, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.8997, "step": 2845 }, { "epoch": 0.8932480677939504, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.7818, "step": 2846 }, { "epoch": 0.8935619286751147, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.3554, "step": 2847 }, { "epoch": 0.8938757895562792, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.9271, "step": 2848 }, { "epoch": 0.8941896504374436, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.0679, "step": 2849 }, { "epoch": 0.8945035113186081, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.6269, "step": 2850 }, { "epoch": 0.8948173721997724, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.3056, "step": 2851 }, { "epoch": 0.8951312330809369, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.2266, "step": 2852 }, { "epoch": 0.8954450939621013, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4075, "step": 2853 }, { "epoch": 0.8957589548432657, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.4304, "step": 2854 }, { "epoch": 0.8960728157244301, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4618, "step": 2855 }, { "epoch": 0.8963866766055946, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.4074, "step": 2856 }, { "epoch": 0.896700537486759, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5807, "step": 2857 }, { "epoch": 0.8970143983679234, "grad_norm": 0.1083984375, "learning_rate": 0.0002, "loss": 1.232, "step": 2858 }, { "epoch": 0.8973282592490879, "grad_norm": 0.1171875, "learning_rate": 0.0002, "loss": 1.4562, "step": 2859 }, { "epoch": 0.8976421201302522, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.393, "step": 2860 }, { "epoch": 0.8979559810114167, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.5262, "step": 2861 }, { "epoch": 0.8982698418925811, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4298, "step": 2862 }, { "epoch": 0.8985837027737456, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4761, "step": 2863 }, { "epoch": 0.8988975636549099, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.4818, "step": 2864 }, { "epoch": 0.8992114245360744, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5365, "step": 2865 }, { "epoch": 0.8995252854172389, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.2675, "step": 2866 }, { "epoch": 0.8998391462984032, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3837, "step": 2867 }, { "epoch": 0.9001530071795677, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4809, "step": 2868 }, { "epoch": 0.9004668680607321, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.6493, "step": 2869 }, { "epoch": 0.9007807289418965, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5541, "step": 2870 }, { "epoch": 0.9010945898230609, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.638, "step": 2871 }, { "epoch": 0.9014084507042254, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.6448, "step": 2872 }, { "epoch": 0.9017223115853897, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.6104, "step": 2873 }, { "epoch": 0.9020361724665542, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3193, "step": 2874 }, { "epoch": 0.9023500333477186, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.5697, "step": 2875 }, { "epoch": 0.902663894228883, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.5283, "step": 2876 }, { "epoch": 0.9029777551100475, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.7855, "step": 2877 }, { "epoch": 0.9032916159912119, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.8568, "step": 2878 }, { "epoch": 0.9036054768723764, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.3182, "step": 2879 }, { "epoch": 0.9039193377535407, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.2042, "step": 2880 }, { "epoch": 0.9042331986347052, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.8882, "step": 2881 }, { "epoch": 0.9045470595158696, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.2443, "step": 2882 }, { "epoch": 0.904860920397034, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.9831, "step": 2883 }, { "epoch": 0.9051747812781984, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.7611, "step": 2884 }, { "epoch": 0.9054886421593629, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 2.2978, "step": 2885 }, { "epoch": 0.9058025030405273, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.4272, "step": 2886 }, { "epoch": 0.9061163639216917, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.3842, "step": 2887 }, { "epoch": 0.9064302248028562, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.2893, "step": 2888 }, { "epoch": 0.9067440856840205, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.8636, "step": 2889 }, { "epoch": 0.907057946565185, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.3717, "step": 2890 }, { "epoch": 0.9073718074463494, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 2.5191, "step": 2891 }, { "epoch": 0.9076856683275139, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.3966, "step": 2892 }, { "epoch": 0.9079995292086782, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 2.0948, "step": 2893 }, { "epoch": 0.9083133900898427, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.9643, "step": 2894 }, { "epoch": 0.908627250971007, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.7443, "step": 2895 }, { "epoch": 0.9089411118521715, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.1531, "step": 2896 }, { "epoch": 0.909254972733336, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.0995, "step": 2897 }, { "epoch": 0.9095688336145004, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.1549, "step": 2898 }, { "epoch": 0.9098826944956648, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.6941, "step": 2899 }, { "epoch": 0.9101965553768292, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.8947, "step": 2900 }, { "epoch": 0.9105104162579937, "grad_norm": 0.09033203125, "learning_rate": 0.0002, "loss": 1.2146, "step": 2901 }, { "epoch": 0.910824277139158, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.3977, "step": 2902 }, { "epoch": 0.9111381380203225, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.3151, "step": 2903 }, { "epoch": 0.9114519989014869, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.5023, "step": 2904 }, { "epoch": 0.9117658597826513, "grad_norm": 0.11962890625, "learning_rate": 0.0002, "loss": 1.2814, "step": 2905 }, { "epoch": 0.9120797206638158, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.487, "step": 2906 }, { "epoch": 0.9123935815449802, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.4147, "step": 2907 }, { "epoch": 0.9127074424261447, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.6928, "step": 2908 }, { "epoch": 0.913021303307309, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4939, "step": 2909 }, { "epoch": 0.9133351641884735, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.4947, "step": 2910 }, { "epoch": 0.9136490250696379, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.444, "step": 2911 }, { "epoch": 0.9139628859508023, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.4336, "step": 2912 }, { "epoch": 0.9142767468319667, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3835, "step": 2913 }, { "epoch": 0.9145906077131312, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4232, "step": 2914 }, { "epoch": 0.9149044685942955, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.4444, "step": 2915 }, { "epoch": 0.91521832947546, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5787, "step": 2916 }, { "epoch": 0.9155321903566245, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.2689, "step": 2917 }, { "epoch": 0.9158460512377888, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.3692, "step": 2918 }, { "epoch": 0.9161599121189533, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.5798, "step": 2919 }, { "epoch": 0.9164737730001177, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.6355, "step": 2920 }, { "epoch": 0.9167876338812821, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.5006, "step": 2921 }, { "epoch": 0.9171014947624465, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.448, "step": 2922 }, { "epoch": 0.917415355643611, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.6705, "step": 2923 }, { "epoch": 0.9177292165247753, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.6719, "step": 2924 }, { "epoch": 0.9180430774059398, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.5722, "step": 2925 }, { "epoch": 0.9183569382871043, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.9757, "step": 2926 }, { "epoch": 0.9186707991682687, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.6939, "step": 2927 }, { "epoch": 0.9189846600494331, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.8072, "step": 2928 }, { "epoch": 0.9192985209305975, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.9631, "step": 2929 }, { "epoch": 0.919612381811762, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 2.1459, "step": 2930 }, { "epoch": 0.9199262426929263, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.8304, "step": 2931 }, { "epoch": 0.9202401035740908, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 2.2797, "step": 2932 }, { "epoch": 0.9205539644552552, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 2.0257, "step": 2933 }, { "epoch": 0.9208678253364196, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.0304, "step": 2934 }, { "epoch": 0.921181686217584, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.7163, "step": 2935 }, { "epoch": 0.9214955470987485, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.1801, "step": 2936 }, { "epoch": 0.921809407979913, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.1863, "step": 2937 }, { "epoch": 0.9221232688610773, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.7344, "step": 2938 }, { "epoch": 0.9224371297422418, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.1153, "step": 2939 }, { "epoch": 0.9227509906234062, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.1959, "step": 2940 }, { "epoch": 0.9230648515045706, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.5984, "step": 2941 }, { "epoch": 0.923378712385735, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.3594, "step": 2942 }, { "epoch": 0.9236925732668995, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 2.1415, "step": 2943 }, { "epoch": 0.9240064341480638, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.2361, "step": 2944 }, { "epoch": 0.9243202950292283, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.9417, "step": 2945 }, { "epoch": 0.9246341559103927, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.7594, "step": 2946 }, { "epoch": 0.9249480167915571, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1687, "step": 2947 }, { "epoch": 0.9252618776727216, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.1085, "step": 2948 }, { "epoch": 0.925575738553886, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.9088, "step": 2949 }, { "epoch": 0.9258895994350504, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.7049, "step": 2950 }, { "epoch": 0.9262034603162148, "grad_norm": 0.0732421875, "learning_rate": 0.0002, "loss": 1.3151, "step": 2951 }, { "epoch": 0.9265173211973793, "grad_norm": 0.09521484375, "learning_rate": 0.0002, "loss": 1.2594, "step": 2952 }, { "epoch": 0.9268311820785436, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3481, "step": 2953 }, { "epoch": 0.9271450429597081, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3905, "step": 2954 }, { "epoch": 0.9274589038408725, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6019, "step": 2955 }, { "epoch": 0.927772764722037, "grad_norm": 0.119140625, "learning_rate": 0.0002, "loss": 1.452, "step": 2956 }, { "epoch": 0.9280866256032014, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4319, "step": 2957 }, { "epoch": 0.9284004864843658, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.509, "step": 2958 }, { "epoch": 0.9287143473655303, "grad_norm": 0.1103515625, "learning_rate": 0.0002, "loss": 1.3817, "step": 2959 }, { "epoch": 0.9290282082466946, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.476, "step": 2960 }, { "epoch": 0.9293420691278591, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5592, "step": 2961 }, { "epoch": 0.9296559300090235, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4542, "step": 2962 }, { "epoch": 0.9299697908901879, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5691, "step": 2963 }, { "epoch": 0.9302836517713523, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.4798, "step": 2964 }, { "epoch": 0.9305975126525168, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.436, "step": 2965 }, { "epoch": 0.9309113735336811, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4208, "step": 2966 }, { "epoch": 0.9312252344148456, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.3743, "step": 2967 }, { "epoch": 0.9315390952960101, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.3202, "step": 2968 }, { "epoch": 0.9318529561771745, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3094, "step": 2969 }, { "epoch": 0.9321668170583389, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.4133, "step": 2970 }, { "epoch": 0.9324806779395033, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.4211, "step": 2971 }, { "epoch": 0.9327945388206678, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3897, "step": 2972 }, { "epoch": 0.9331083997018321, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.5081, "step": 2973 }, { "epoch": 0.9334222605829966, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.4796, "step": 2974 }, { "epoch": 0.933736121464161, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.4619, "step": 2975 }, { "epoch": 0.9340499823453254, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7486, "step": 2976 }, { "epoch": 0.9343638432264899, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.5938, "step": 2977 }, { "epoch": 0.9346777041076543, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.7117, "step": 2978 }, { "epoch": 0.9349915649888187, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.0723, "step": 2979 }, { "epoch": 0.9353054258699831, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.6665, "step": 2980 }, { "epoch": 0.9356192867511476, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.8341, "step": 2981 }, { "epoch": 0.935933147632312, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.6164, "step": 2982 }, { "epoch": 0.9362470085134764, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1117, "step": 2983 }, { "epoch": 0.9365608693946408, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.2635, "step": 2984 }, { "epoch": 0.9368747302758053, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.1706, "step": 2985 }, { "epoch": 0.9371885911569696, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.7411, "step": 2986 }, { "epoch": 0.9375024520381341, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 2.133, "step": 2987 }, { "epoch": 0.9378163129192986, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.2699, "step": 2988 }, { "epoch": 0.9381301738004629, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.3278, "step": 2989 }, { "epoch": 0.9384440346816274, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.8978, "step": 2990 }, { "epoch": 0.9387578955627918, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.4348, "step": 2991 }, { "epoch": 0.9390717564439562, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.2424, "step": 2992 }, { "epoch": 0.9390717564439562, "eval_loss": 1.764998435974121, "eval_runtime": 123.5438, "eval_samples_per_second": 8.094, "eval_steps_per_second": 8.094, "step": 2992 }, { "epoch": 0.9390717564439562, "mmlu_eval_accuracy": 0.4093842500541707, "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5930232558139535, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.3352941176470588, "mmlu_eval_accuracy_professional_medicine": 0.4838709677419355, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.226706854507551, "step": 2992 }, { "epoch": 0.9393856173251206, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8017, "step": 2993 }, { "epoch": 0.9396994782062851, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.5615, "step": 2994 }, { "epoch": 0.9400133390874494, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.2336, "step": 2995 }, { "epoch": 0.9403271999686139, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 2.3521, "step": 2996 }, { "epoch": 0.9406410608497784, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.755, "step": 2997 }, { "epoch": 0.9409549217309428, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.2811, "step": 2998 }, { "epoch": 0.9412687826121072, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.4054, "step": 2999 }, { "epoch": 0.9415826434932716, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 3.1927, "step": 3000 }, { "epoch": 0.9418965043744361, "grad_norm": 0.0830078125, "learning_rate": 0.0002, "loss": 1.4615, "step": 3001 }, { "epoch": 0.9422103652556004, "grad_norm": 0.10791015625, "learning_rate": 0.0002, "loss": 1.4297, "step": 3002 }, { "epoch": 0.9425242261367649, "grad_norm": 0.0947265625, "learning_rate": 0.0002, "loss": 1.3411, "step": 3003 }, { "epoch": 0.9428380870179293, "grad_norm": 0.1005859375, "learning_rate": 0.0002, "loss": 1.3809, "step": 3004 }, { "epoch": 0.9431519478990937, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.5347, "step": 3005 }, { "epoch": 0.9434658087802581, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.256, "step": 3006 }, { "epoch": 0.9437796696614226, "grad_norm": 0.1162109375, "learning_rate": 0.0002, "loss": 1.4369, "step": 3007 }, { "epoch": 0.944093530542587, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.3979, "step": 3008 }, { "epoch": 0.9444073914237514, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3387, "step": 3009 }, { "epoch": 0.9447212523049159, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.5077, "step": 3010 }, { "epoch": 0.9450351131860802, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.442, "step": 3011 }, { "epoch": 0.9453489740672447, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5246, "step": 3012 }, { "epoch": 0.9456628349484091, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.3485, "step": 3013 }, { "epoch": 0.9459766958295736, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4025, "step": 3014 }, { "epoch": 0.9462905567107379, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.4433, "step": 3015 }, { "epoch": 0.9466044175919024, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.3678, "step": 3016 }, { "epoch": 0.9469182784730669, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6643, "step": 3017 }, { "epoch": 0.9472321393542312, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4615, "step": 3018 }, { "epoch": 0.9475460002353957, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.5222, "step": 3019 }, { "epoch": 0.9478598611165601, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4337, "step": 3020 }, { "epoch": 0.9481737219977245, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.5281, "step": 3021 }, { "epoch": 0.9484875828788889, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.4598, "step": 3022 }, { "epoch": 0.9488014437600534, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.6235, "step": 3023 }, { "epoch": 0.9491153046412177, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5128, "step": 3024 }, { "epoch": 0.9494291655223822, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.8316, "step": 3025 }, { "epoch": 0.9497430264035466, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.6191, "step": 3026 }, { "epoch": 0.950056887284711, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.7582, "step": 3027 }, { "epoch": 0.9503707481658755, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.7131, "step": 3028 }, { "epoch": 0.9506846090470399, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.5641, "step": 3029 }, { "epoch": 0.9509984699282044, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.9696, "step": 3030 }, { "epoch": 0.9513123308093687, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 2.2189, "step": 3031 }, { "epoch": 0.9516261916905332, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.7686, "step": 3032 }, { "epoch": 0.9519400525716976, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.8801, "step": 3033 }, { "epoch": 0.952253913452862, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.7235, "step": 3034 }, { "epoch": 0.9525677743340264, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.9111, "step": 3035 }, { "epoch": 0.9528816352151909, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 2.1576, "step": 3036 }, { "epoch": 0.9531954960963553, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.4213, "step": 3037 }, { "epoch": 0.9535093569775197, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.5938, "step": 3038 }, { "epoch": 0.9538232178586842, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.451, "step": 3039 }, { "epoch": 0.9541370787398485, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.981, "step": 3040 }, { "epoch": 0.954450939621013, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.1268, "step": 3041 }, { "epoch": 0.9547648005021774, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.7128, "step": 3042 }, { "epoch": 0.9550786613833419, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.953, "step": 3043 }, { "epoch": 0.9553925222645062, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.6608, "step": 3044 }, { "epoch": 0.9557063831456707, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.1209, "step": 3045 }, { "epoch": 0.956020244026835, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 2.1233, "step": 3046 }, { "epoch": 0.9563341049079995, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.8969, "step": 3047 }, { "epoch": 0.956647965789164, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.8317, "step": 3048 }, { "epoch": 0.9569618266703284, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.983, "step": 3049 }, { "epoch": 0.9572756875514928, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.5904, "step": 3050 }, { "epoch": 0.9575895484326572, "grad_norm": 0.10107421875, "learning_rate": 0.0002, "loss": 1.3929, "step": 3051 }, { "epoch": 0.9579034093138217, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.3647, "step": 3052 }, { "epoch": 0.958217270194986, "grad_norm": 0.11328125, "learning_rate": 0.0002, "loss": 1.3026, "step": 3053 }, { "epoch": 0.9585311310761505, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.513, "step": 3054 }, { "epoch": 0.9588449919573149, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5286, "step": 3055 }, { "epoch": 0.9591588528384793, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.427, "step": 3056 }, { "epoch": 0.9594727137196438, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.4327, "step": 3057 }, { "epoch": 0.9597865746008082, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.5289, "step": 3058 }, { "epoch": 0.9601004354819727, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5179, "step": 3059 }, { "epoch": 0.960414296363137, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.3738, "step": 3060 }, { "epoch": 0.9607281572443015, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4025, "step": 3061 }, { "epoch": 0.9610420181254659, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5099, "step": 3062 }, { "epoch": 0.9613558790066303, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4911, "step": 3063 }, { "epoch": 0.9616697398877947, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5292, "step": 3064 }, { "epoch": 0.9619836007689592, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.347, "step": 3065 }, { "epoch": 0.9622974616501235, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5005, "step": 3066 }, { "epoch": 0.962611322531288, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3431, "step": 3067 }, { "epoch": 0.9629251834124525, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4965, "step": 3068 }, { "epoch": 0.9632390442936168, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.5808, "step": 3069 }, { "epoch": 0.9635529051747813, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.7133, "step": 3070 }, { "epoch": 0.9638667660559457, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2565, "step": 3071 }, { "epoch": 0.9641806269371102, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.4294, "step": 3072 }, { "epoch": 0.9644944878182745, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.5612, "step": 3073 }, { "epoch": 0.964808348699439, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.6573, "step": 3074 }, { "epoch": 0.9651222095806034, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.8696, "step": 3075 }, { "epoch": 0.9654360704617678, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.6256, "step": 3076 }, { "epoch": 0.9657499313429323, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.7176, "step": 3077 }, { "epoch": 0.9660637922240967, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.5864, "step": 3078 }, { "epoch": 0.9663776531052611, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4329, "step": 3079 }, { "epoch": 0.9666915139864255, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 2.2613, "step": 3080 }, { "epoch": 0.96700537486759, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 2.0, "step": 3081 }, { "epoch": 0.9673192357487543, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.8663, "step": 3082 }, { "epoch": 0.9676330966299188, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 2.303, "step": 3083 }, { "epoch": 0.9679469575110832, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.1145, "step": 3084 }, { "epoch": 0.9682608183922476, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.9761, "step": 3085 }, { "epoch": 0.968574679273412, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.0674, "step": 3086 }, { "epoch": 0.9688885401545765, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.1535, "step": 3087 }, { "epoch": 0.969202401035741, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.2142, "step": 3088 }, { "epoch": 0.9695162619169053, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 2.1012, "step": 3089 }, { "epoch": 0.9698301227980698, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.2116, "step": 3090 }, { "epoch": 0.9701439836792342, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.1366, "step": 3091 }, { "epoch": 0.9704578445603986, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 2.5627, "step": 3092 }, { "epoch": 0.970771705441563, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.1147, "step": 3093 }, { "epoch": 0.9710855663227275, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.7821, "step": 3094 }, { "epoch": 0.9713994272038918, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.6043, "step": 3095 }, { "epoch": 0.9717132880850563, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.2847, "step": 3096 }, { "epoch": 0.9720271489662208, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.2498, "step": 3097 }, { "epoch": 0.9723410098473851, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.1608, "step": 3098 }, { "epoch": 0.9726548707285496, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 2.4847, "step": 3099 }, { "epoch": 0.972968731609714, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.3834, "step": 3100 }, { "epoch": 0.9732825924908785, "grad_norm": 0.0869140625, "learning_rate": 0.0002, "loss": 1.3037, "step": 3101 }, { "epoch": 0.9735964533720428, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.4379, "step": 3102 }, { "epoch": 0.9739103142532073, "grad_norm": 0.11181640625, "learning_rate": 0.0002, "loss": 1.3402, "step": 3103 }, { "epoch": 0.9742241751343717, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3735, "step": 3104 }, { "epoch": 0.9745380360155361, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.376, "step": 3105 }, { "epoch": 0.9748518968967005, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5283, "step": 3106 }, { "epoch": 0.975165757777865, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.5694, "step": 3107 }, { "epoch": 0.9754796186590294, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.334, "step": 3108 }, { "epoch": 0.9757934795401938, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.528, "step": 3109 }, { "epoch": 0.9761073404213583, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.52, "step": 3110 }, { "epoch": 0.9764212013025226, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4252, "step": 3111 }, { "epoch": 0.9767350621836871, "grad_norm": 0.11572265625, "learning_rate": 0.0002, "loss": 1.3186, "step": 3112 }, { "epoch": 0.9770489230648515, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.6066, "step": 3113 }, { "epoch": 0.977362783946016, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.544, "step": 3114 }, { "epoch": 0.9776766448271803, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4584, "step": 3115 }, { "epoch": 0.9779905057083448, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3325, "step": 3116 }, { "epoch": 0.9783043665895093, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.3468, "step": 3117 }, { "epoch": 0.9786182274706736, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.4411, "step": 3118 }, { "epoch": 0.9789320883518381, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.2572, "step": 3119 }, { "epoch": 0.9792459492330025, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.4013, "step": 3120 }, { "epoch": 0.9795598101141669, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4012, "step": 3121 }, { "epoch": 0.9798736709953313, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.4045, "step": 3122 }, { "epoch": 0.9801875318764958, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.1227, "step": 3123 }, { "epoch": 0.9805013927576601, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.6554, "step": 3124 }, { "epoch": 0.9808152536388246, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.5291, "step": 3125 }, { "epoch": 0.981129114519989, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.8137, "step": 3126 }, { "epoch": 0.9814429754011534, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.5328, "step": 3127 }, { "epoch": 0.9817568362823179, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.8186, "step": 3128 }, { "epoch": 0.9820706971634823, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 2.1706, "step": 3129 }, { "epoch": 0.9823845580446467, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.6254, "step": 3130 }, { "epoch": 0.9826984189258111, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 2.0098, "step": 3131 }, { "epoch": 0.9830122798069756, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.9764, "step": 3132 }, { "epoch": 0.98332614068814, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 2.1314, "step": 3133 }, { "epoch": 0.9836400015693044, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 2.2194, "step": 3134 }, { "epoch": 0.9839538624504688, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.5701, "step": 3135 }, { "epoch": 0.9842677233316333, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.304, "step": 3136 }, { "epoch": 0.9845815842127976, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 2.1723, "step": 3137 }, { "epoch": 0.9848954450939621, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 2.8468, "step": 3138 }, { "epoch": 0.9852093059751266, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.6077, "step": 3139 }, { "epoch": 0.9855231668562909, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.6458, "step": 3140 }, { "epoch": 0.9858370277374554, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.2956, "step": 3141 }, { "epoch": 0.9861508886186198, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 2.2138, "step": 3142 }, { "epoch": 0.9864647494997842, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 2.1963, "step": 3143 }, { "epoch": 0.9867786103809486, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.9578, "step": 3144 }, { "epoch": 0.9870924712621131, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.2266, "step": 3145 }, { "epoch": 0.9874063321432774, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.6495, "step": 3146 }, { "epoch": 0.9877201930244419, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.0621, "step": 3147 }, { "epoch": 0.9880340539056064, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.6402, "step": 3148 }, { "epoch": 0.9883479147867708, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 2.4478, "step": 3149 }, { "epoch": 0.9886617756679352, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.7114, "step": 3150 }, { "epoch": 0.9889756365490996, "grad_norm": 0.09130859375, "learning_rate": 0.0002, "loss": 1.3267, "step": 3151 }, { "epoch": 0.9892894974302641, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.333, "step": 3152 }, { "epoch": 0.9896033583114284, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.359, "step": 3153 }, { "epoch": 0.9899172191925929, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.2814, "step": 3154 }, { "epoch": 0.9902310800737573, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.3078, "step": 3155 }, { "epoch": 0.9905449409549217, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5243, "step": 3156 }, { "epoch": 0.9908588018360861, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4939, "step": 3157 }, { "epoch": 0.9911726627172506, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5222, "step": 3158 }, { "epoch": 0.991486523598415, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5443, "step": 3159 }, { "epoch": 0.9918003844795794, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.4496, "step": 3160 }, { "epoch": 0.9921142453607439, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.4076, "step": 3161 }, { "epoch": 0.9924281062419082, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.387, "step": 3162 }, { "epoch": 0.9927419671230727, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.4078, "step": 3163 }, { "epoch": 0.9930558280042371, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4414, "step": 3164 }, { "epoch": 0.9933696888854016, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.526, "step": 3165 }, { "epoch": 0.9936835497665659, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.6098, "step": 3166 }, { "epoch": 0.9939974106477304, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.6074, "step": 3167 }, { "epoch": 0.9943112715288949, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.4543, "step": 3168 }, { "epoch": 0.9946251324100592, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.5867, "step": 3169 }, { "epoch": 0.9949389932912237, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.5843, "step": 3170 }, { "epoch": 0.9952528541723881, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 2.0918, "step": 3171 }, { "epoch": 0.9955667150535525, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.0152, "step": 3172 }, { "epoch": 0.9958805759347169, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.5978, "step": 3173 }, { "epoch": 0.9961944368158814, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 2.2455, "step": 3174 }, { "epoch": 0.9965082976970457, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 2.0231, "step": 3175 }, { "epoch": 0.9968221585782102, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.9791, "step": 3176 }, { "epoch": 0.9971360194593746, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.3723, "step": 3177 }, { "epoch": 0.997449880340539, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 2.5541, "step": 3178 }, { "epoch": 0.9977637412217035, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.4854, "step": 3179 }, { "epoch": 0.9977637412217035, "eval_loss": 1.7612242698669434, "eval_runtime": 122.9212, "eval_samples_per_second": 8.135, "eval_steps_per_second": 8.135, "step": 3179 }, { "epoch": 0.9977637412217035, "mmlu_eval_accuracy": 0.4094543774249976, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.55, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.6162790697674418, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.41935483870967744, "mmlu_eval_accuracy_professional_law": 0.3411764705882353, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.331150726058139, "step": 3179 }, { "epoch": 0.9980776021028679, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.5063, "step": 3180 }, { "epoch": 0.9983914629840324, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 2.0594, "step": 3181 }, { "epoch": 0.9987053238651967, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7996, "step": 3182 }, { "epoch": 0.9990191847463612, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.5581, "step": 3183 }, { "epoch": 0.9993330456275256, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.3198, "step": 3184 }, { "epoch": 0.99964690650869, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 2.0987, "step": 3185 }, { "epoch": 0.9999607673898544, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.6696, "step": 3186 }, { "epoch": 1.0002746282710189, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 2.0604, "step": 3187 }, { "epoch": 1.0005884891521832, "grad_norm": 0.09130859375, "learning_rate": 0.0002, "loss": 1.3877, "step": 3188 }, { "epoch": 1.0009023500333478, "grad_norm": 0.10595703125, "learning_rate": 0.0002, "loss": 1.3389, "step": 3189 }, { "epoch": 1.0012162109145122, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3807, "step": 3190 }, { "epoch": 1.0015300717956765, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3194, "step": 3191 }, { "epoch": 1.001843932676841, "grad_norm": 0.11767578125, "learning_rate": 0.0002, "loss": 1.3926, "step": 3192 }, { "epoch": 1.0021577935580055, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3041, "step": 3193 }, { "epoch": 1.0024716544391699, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3575, "step": 3194 }, { "epoch": 1.0027855153203342, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.2774, "step": 3195 }, { "epoch": 1.0030993762014986, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.3192, "step": 3196 }, { "epoch": 1.0034132370826632, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4229, "step": 3197 }, { "epoch": 1.0037270979638275, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3111, "step": 3198 }, { "epoch": 1.004040958844992, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2728, "step": 3199 }, { "epoch": 1.0043548197261565, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3139, "step": 3200 }, { "epoch": 1.0046686806073208, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2393, "step": 3201 }, { "epoch": 1.0049825414884852, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.3164, "step": 3202 }, { "epoch": 1.0052964023696496, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.3657, "step": 3203 }, { "epoch": 1.0056102632508142, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4096, "step": 3204 }, { "epoch": 1.0059241241319785, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.411, "step": 3205 }, { "epoch": 1.0062379850131429, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.2077, "step": 3206 }, { "epoch": 1.0065518458943072, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.2669, "step": 3207 }, { "epoch": 1.0068657067754718, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4376, "step": 3208 }, { "epoch": 1.0071795676566362, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.1823, "step": 3209 }, { "epoch": 1.0074934285378006, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.4707, "step": 3210 }, { "epoch": 1.0078072894189651, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.4341, "step": 3211 }, { "epoch": 1.0081211503001295, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.5259, "step": 3212 }, { "epoch": 1.0084350111812939, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3221, "step": 3213 }, { "epoch": 1.0087488720624582, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.7058, "step": 3214 }, { "epoch": 1.0090627329436228, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.5108, "step": 3215 }, { "epoch": 1.0093765938247872, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.6032, "step": 3216 }, { "epoch": 1.0096904547059515, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.7211, "step": 3217 }, { "epoch": 1.0100043155871161, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.6075, "step": 3218 }, { "epoch": 1.0103181764682805, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 2.0775, "step": 3219 }, { "epoch": 1.0106320373494448, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.9127, "step": 3220 }, { "epoch": 1.0109458982306092, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.9032, "step": 3221 }, { "epoch": 1.0112597591117738, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 2.2753, "step": 3222 }, { "epoch": 1.0115736199929382, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.9234, "step": 3223 }, { "epoch": 1.0118874808741025, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.9765, "step": 3224 }, { "epoch": 1.0122013417552669, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.9529, "step": 3225 }, { "epoch": 1.0125152026364315, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.9424, "step": 3226 }, { "epoch": 1.0128290635175958, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 2.0726, "step": 3227 }, { "epoch": 1.0131429243987602, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.0097, "step": 3228 }, { "epoch": 1.0134567852799248, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.1058, "step": 3229 }, { "epoch": 1.0137706461610891, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.1626, "step": 3230 }, { "epoch": 1.0140845070422535, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.7617, "step": 3231 }, { "epoch": 1.0143983679234179, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.7791, "step": 3232 }, { "epoch": 1.0147122288045825, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.5761, "step": 3233 }, { "epoch": 1.0150260896857468, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.8905, "step": 3234 }, { "epoch": 1.0153399505669112, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.9349, "step": 3235 }, { "epoch": 1.0156538114480755, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.2521, "step": 3236 }, { "epoch": 1.0159676723292401, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.7394, "step": 3237 }, { "epoch": 1.0162815332104045, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.0921, "step": 3238 }, { "epoch": 1.0165953940915688, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.435, "step": 3239 }, { "epoch": 1.0169092549727334, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.322, "step": 3240 }, { "epoch": 1.0172231158538978, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5587, "step": 3241 }, { "epoch": 1.0175369767350622, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.2465, "step": 3242 }, { "epoch": 1.0178508376162265, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.3037, "step": 3243 }, { "epoch": 1.018164698497391, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.3711, "step": 3244 }, { "epoch": 1.0184785593785555, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5833, "step": 3245 }, { "epoch": 1.0187924202597198, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.2177, "step": 3246 }, { "epoch": 1.0191062811408842, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.3321, "step": 3247 }, { "epoch": 1.0194201420220488, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.2415, "step": 3248 }, { "epoch": 1.0197340029032131, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.2932, "step": 3249 }, { "epoch": 1.0200478637843775, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.3618, "step": 3250 }, { "epoch": 1.020361724665542, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.1451, "step": 3251 }, { "epoch": 1.0206755855467065, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3904, "step": 3252 }, { "epoch": 1.0209894464278708, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3366, "step": 3253 }, { "epoch": 1.0213033073090352, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.47, "step": 3254 }, { "epoch": 1.0216171681901998, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.2812, "step": 3255 }, { "epoch": 1.0219310290713641, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3385, "step": 3256 }, { "epoch": 1.0222448899525285, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.6318, "step": 3257 }, { "epoch": 1.022558750833693, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3832, "step": 3258 }, { "epoch": 1.0228726117148574, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.2538, "step": 3259 }, { "epoch": 1.0231864725960218, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.6401, "step": 3260 }, { "epoch": 1.0235003334771862, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.3818, "step": 3261 }, { "epoch": 1.0238141943583507, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.5104, "step": 3262 }, { "epoch": 1.0241280552395151, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.704, "step": 3263 }, { "epoch": 1.0244419161206795, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 2.0966, "step": 3264 }, { "epoch": 1.0247557770018438, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.7171, "step": 3265 }, { "epoch": 1.0250696378830084, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.6738, "step": 3266 }, { "epoch": 1.0253834987641728, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6724, "step": 3267 }, { "epoch": 1.0256973596453371, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.9199, "step": 3268 }, { "epoch": 1.0260112205265017, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.8901, "step": 3269 }, { "epoch": 1.026325081407666, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.9366, "step": 3270 }, { "epoch": 1.0266389422888305, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.8268, "step": 3271 }, { "epoch": 1.0269528031699948, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.751, "step": 3272 }, { "epoch": 1.0272666640511594, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 2.2259, "step": 3273 }, { "epoch": 1.0275805249323238, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 2.0009, "step": 3274 }, { "epoch": 1.0278943858134881, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 2.0155, "step": 3275 }, { "epoch": 1.0282082466946525, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.7522, "step": 3276 }, { "epoch": 1.028522107575817, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.2466, "step": 3277 }, { "epoch": 1.0288359684569814, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.8462, "step": 3278 }, { "epoch": 1.0291498293381458, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.9953, "step": 3279 }, { "epoch": 1.0294636902193104, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 2.185, "step": 3280 }, { "epoch": 1.0297775511004748, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.9967, "step": 3281 }, { "epoch": 1.0300914119816391, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.7523, "step": 3282 }, { "epoch": 1.0304052728628035, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.7159, "step": 3283 }, { "epoch": 1.030719133743968, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.3885, "step": 3284 }, { "epoch": 1.0310329946251324, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.5344, "step": 3285 }, { "epoch": 1.0313468555062968, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 2.2785, "step": 3286 }, { "epoch": 1.0316607163874612, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.4047, "step": 3287 }, { "epoch": 1.0319745772686257, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.4677, "step": 3288 }, { "epoch": 1.03228843814979, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6365, "step": 3289 }, { "epoch": 1.0326022990309545, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3259, "step": 3290 }, { "epoch": 1.032916159912119, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.2246, "step": 3291 }, { "epoch": 1.0332300207932834, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.3766, "step": 3292 }, { "epoch": 1.0335438816744478, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.4346, "step": 3293 }, { "epoch": 1.0338577425556121, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.4882, "step": 3294 }, { "epoch": 1.0341716034367767, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2832, "step": 3295 }, { "epoch": 1.034485464317941, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.3905, "step": 3296 }, { "epoch": 1.0347993251991054, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.175, "step": 3297 }, { "epoch": 1.0351131860802698, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4527, "step": 3298 }, { "epoch": 1.0354270469614344, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.2958, "step": 3299 }, { "epoch": 1.0357409078425988, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.272, "step": 3300 }, { "epoch": 1.0360547687237631, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.3249, "step": 3301 }, { "epoch": 1.0363686296049277, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2704, "step": 3302 }, { "epoch": 1.036682490486092, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.4118, "step": 3303 }, { "epoch": 1.0369963513672564, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.3258, "step": 3304 }, { "epoch": 1.0373102122484208, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.3525, "step": 3305 }, { "epoch": 1.0376240731295854, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.3024, "step": 3306 }, { "epoch": 1.0379379340107497, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.4737, "step": 3307 }, { "epoch": 1.038251794891914, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.5573, "step": 3308 }, { "epoch": 1.0385656557730787, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.495, "step": 3309 }, { "epoch": 1.038879516654243, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.4378, "step": 3310 }, { "epoch": 1.0391933775354074, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.5224, "step": 3311 }, { "epoch": 1.0395072384165718, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.5927, "step": 3312 }, { "epoch": 1.0398210992977364, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3604, "step": 3313 }, { "epoch": 1.0401349601789007, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.4609, "step": 3314 }, { "epoch": 1.040448821060065, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.7497, "step": 3315 }, { "epoch": 1.0407626819412295, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.7503, "step": 3316 }, { "epoch": 1.041076542822394, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.9545, "step": 3317 }, { "epoch": 1.0413904037035584, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.0486, "step": 3318 }, { "epoch": 1.0417042645847228, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.8276, "step": 3319 }, { "epoch": 1.0420181254658873, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.8632, "step": 3320 }, { "epoch": 1.0423319863470517, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.9163, "step": 3321 }, { "epoch": 1.042645847228216, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.9768, "step": 3322 }, { "epoch": 1.0429597081093804, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.9516, "step": 3323 }, { "epoch": 1.043273568990545, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.8407, "step": 3324 }, { "epoch": 1.0435874298717094, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.9521, "step": 3325 }, { "epoch": 1.0439012907528737, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.9723, "step": 3326 }, { "epoch": 1.044215151634038, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6469, "step": 3327 }, { "epoch": 1.0445290125152027, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 2.2362, "step": 3328 }, { "epoch": 1.044842873396367, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.6164, "step": 3329 }, { "epoch": 1.0451567342775314, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.5419, "step": 3330 }, { "epoch": 1.045470595158696, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 2.2275, "step": 3331 }, { "epoch": 1.0457844560398604, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.0776, "step": 3332 }, { "epoch": 1.0460983169210247, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.9309, "step": 3333 }, { "epoch": 1.046412177802189, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.0388, "step": 3334 }, { "epoch": 1.0467260386833537, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 2.0802, "step": 3335 }, { "epoch": 1.047039899564518, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.5374, "step": 3336 }, { "epoch": 1.0473537604456824, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.5432, "step": 3337 }, { "epoch": 1.047667621326847, "grad_norm": 0.11865234375, "learning_rate": 0.0002, "loss": 1.4247, "step": 3338 }, { "epoch": 1.0479814822080114, "grad_norm": 0.12255859375, "learning_rate": 0.0002, "loss": 1.2922, "step": 3339 }, { "epoch": 1.0482953430891757, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.2041, "step": 3340 }, { "epoch": 1.04860920397034, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4376, "step": 3341 }, { "epoch": 1.0489230648515047, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3131, "step": 3342 }, { "epoch": 1.049236925732669, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4706, "step": 3343 }, { "epoch": 1.0495507866138334, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.228, "step": 3344 }, { "epoch": 1.0498646474949977, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.3428, "step": 3345 }, { "epoch": 1.0501785083761623, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.2709, "step": 3346 }, { "epoch": 1.0504923692573267, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.2898, "step": 3347 }, { "epoch": 1.050806230138491, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.292, "step": 3348 }, { "epoch": 1.0511200910196556, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.3754, "step": 3349 }, { "epoch": 1.05143395190082, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2088, "step": 3350 }, { "epoch": 1.0517478127819844, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3938, "step": 3351 }, { "epoch": 1.0520616736631487, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.359, "step": 3352 }, { "epoch": 1.0523755345443133, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4313, "step": 3353 }, { "epoch": 1.0526893954254777, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.4475, "step": 3354 }, { "epoch": 1.053003256306642, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.2955, "step": 3355 }, { "epoch": 1.0533171171878064, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.4495, "step": 3356 }, { "epoch": 1.053630978068971, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.512, "step": 3357 }, { "epoch": 1.0539448389501354, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3396, "step": 3358 }, { "epoch": 1.0542586998312997, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.5656, "step": 3359 }, { "epoch": 1.0545725607124643, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.4603, "step": 3360 }, { "epoch": 1.0548864215936287, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.5615, "step": 3361 }, { "epoch": 1.055200282474793, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.4789, "step": 3362 }, { "epoch": 1.0555141433559574, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.4517, "step": 3363 }, { "epoch": 1.055828004237122, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.6078, "step": 3364 }, { "epoch": 1.0561418651182863, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.7573, "step": 3365 }, { "epoch": 1.0564557259994507, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.919, "step": 3366 }, { "epoch": 1.0564557259994507, "eval_loss": 1.782015085220337, "eval_runtime": 122.9733, "eval_samples_per_second": 8.132, "eval_steps_per_second": 8.132, "step": 3366 }, { "epoch": 1.0564557259994507, "mmlu_eval_accuracy": 0.4088775674784756, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.21875, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.6162790697674418, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.275355286920561, "step": 3366 }, { "epoch": 1.056769586880615, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.7448, "step": 3367 }, { "epoch": 1.0570834477617796, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.7405, "step": 3368 }, { "epoch": 1.057397308642944, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.8165, "step": 3369 }, { "epoch": 1.0577111695241084, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 2.2684, "step": 3370 }, { "epoch": 1.058025030405273, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.6534, "step": 3371 }, { "epoch": 1.0583388912864373, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.0191, "step": 3372 }, { "epoch": 1.0586527521676017, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.571, "step": 3373 }, { "epoch": 1.058966613048766, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.961, "step": 3374 }, { "epoch": 1.0592804739299306, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.9378, "step": 3375 }, { "epoch": 1.059594334811095, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.8649, "step": 3376 }, { "epoch": 1.0599081956922594, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.7798, "step": 3377 }, { "epoch": 1.0602220565734237, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.8226, "step": 3378 }, { "epoch": 1.0605359174545883, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.0467, "step": 3379 }, { "epoch": 1.0608497783357527, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 2.1341, "step": 3380 }, { "epoch": 1.061163639216917, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.6338, "step": 3381 }, { "epoch": 1.0614775000980816, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.1787, "step": 3382 }, { "epoch": 1.061791360979246, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.8559, "step": 3383 }, { "epoch": 1.0621052218604103, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.1064, "step": 3384 }, { "epoch": 1.0624190827415747, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.6618, "step": 3385 }, { "epoch": 1.0627329436227393, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 2.4034, "step": 3386 }, { "epoch": 1.0630468045039037, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.4687, "step": 3387 }, { "epoch": 1.063360665385068, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.3391, "step": 3388 }, { "epoch": 1.0636745262662326, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3221, "step": 3389 }, { "epoch": 1.063988387147397, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.3004, "step": 3390 }, { "epoch": 1.0643022480285613, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3623, "step": 3391 }, { "epoch": 1.0646161089097257, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.207, "step": 3392 }, { "epoch": 1.0649299697908903, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3389, "step": 3393 }, { "epoch": 1.0652438306720546, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4117, "step": 3394 }, { "epoch": 1.065557691553219, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.4149, "step": 3395 }, { "epoch": 1.0658715524343834, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5053, "step": 3396 }, { "epoch": 1.066185413315548, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.6008, "step": 3397 }, { "epoch": 1.0664992741967123, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.401, "step": 3398 }, { "epoch": 1.0668131350778767, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.4048, "step": 3399 }, { "epoch": 1.0671269959590413, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.3891, "step": 3400 }, { "epoch": 1.0674408568402056, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3934, "step": 3401 }, { "epoch": 1.06775471772137, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.4672, "step": 3402 }, { "epoch": 1.0680685786025343, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.3179, "step": 3403 }, { "epoch": 1.068382439483699, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3361, "step": 3404 }, { "epoch": 1.0686963003648633, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.2874, "step": 3405 }, { "epoch": 1.0690101612460277, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.4479, "step": 3406 }, { "epoch": 1.069324022127192, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.2963, "step": 3407 }, { "epoch": 1.0696378830083566, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.352, "step": 3408 }, { "epoch": 1.069951743889521, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.2518, "step": 3409 }, { "epoch": 1.0702656047706853, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3461, "step": 3410 }, { "epoch": 1.07057946565185, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3596, "step": 3411 }, { "epoch": 1.0708933265330143, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4757, "step": 3412 }, { "epoch": 1.0712071874141786, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.4412, "step": 3413 }, { "epoch": 1.071521048295343, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3274, "step": 3414 }, { "epoch": 1.0718349091765076, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.6775, "step": 3415 }, { "epoch": 1.072148770057672, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.5285, "step": 3416 }, { "epoch": 1.0724626309388363, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.9699, "step": 3417 }, { "epoch": 1.072776491820001, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.8587, "step": 3418 }, { "epoch": 1.0730903527011653, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.1302, "step": 3419 }, { "epoch": 1.0734042135823296, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.017, "step": 3420 }, { "epoch": 1.073718074463494, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.7599, "step": 3421 }, { "epoch": 1.0740319353446586, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.8603, "step": 3422 }, { "epoch": 1.074345796225823, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.7622, "step": 3423 }, { "epoch": 1.0746596571069873, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.8043, "step": 3424 }, { "epoch": 1.0749735179881517, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.0542, "step": 3425 }, { "epoch": 1.0752873788693162, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.1464, "step": 3426 }, { "epoch": 1.0756012397504806, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.0404, "step": 3427 }, { "epoch": 1.075915100631645, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6908, "step": 3428 }, { "epoch": 1.0762289615128093, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.0362, "step": 3429 }, { "epoch": 1.076542822393974, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 2.2215, "step": 3430 }, { "epoch": 1.0768566832751383, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.4921, "step": 3431 }, { "epoch": 1.0771705441563026, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.6788, "step": 3432 }, { "epoch": 1.0774844050374672, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.6164, "step": 3433 }, { "epoch": 1.0777982659186316, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.1122, "step": 3434 }, { "epoch": 1.078112126799796, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.5735, "step": 3435 }, { "epoch": 1.0784259876809603, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.0643, "step": 3436 }, { "epoch": 1.078739848562125, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5819, "step": 3437 }, { "epoch": 1.0790537094432893, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.2702, "step": 3438 }, { "epoch": 1.0793675703244536, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.3787, "step": 3439 }, { "epoch": 1.0796814312056182, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3857, "step": 3440 }, { "epoch": 1.0799952920867826, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3512, "step": 3441 }, { "epoch": 1.080309152967947, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.2302, "step": 3442 }, { "epoch": 1.0806230138491113, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4563, "step": 3443 }, { "epoch": 1.0809368747302759, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3174, "step": 3444 }, { "epoch": 1.0812507356114403, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5022, "step": 3445 }, { "epoch": 1.0815645964926046, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.2804, "step": 3446 }, { "epoch": 1.081878457373769, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.402, "step": 3447 }, { "epoch": 1.0821923182549336, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.3564, "step": 3448 }, { "epoch": 1.082506179136098, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4267, "step": 3449 }, { "epoch": 1.0828200400172623, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.2007, "step": 3450 }, { "epoch": 1.0831339008984269, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3224, "step": 3451 }, { "epoch": 1.0834477617795912, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.3205, "step": 3452 }, { "epoch": 1.0837616226607556, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.3749, "step": 3453 }, { "epoch": 1.08407548354192, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2017, "step": 3454 }, { "epoch": 1.0843893444230845, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.2634, "step": 3455 }, { "epoch": 1.084703205304249, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3602, "step": 3456 }, { "epoch": 1.0850170661854133, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.3545, "step": 3457 }, { "epoch": 1.0853309270665776, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3156, "step": 3458 }, { "epoch": 1.0856447879477422, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.6673, "step": 3459 }, { "epoch": 1.0859586488289066, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.5901, "step": 3460 }, { "epoch": 1.086272509710071, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.2643, "step": 3461 }, { "epoch": 1.0865863705912355, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.6226, "step": 3462 }, { "epoch": 1.0869002314724, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3418, "step": 3463 }, { "epoch": 1.0872140923535643, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.7287, "step": 3464 }, { "epoch": 1.0875279532347286, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.5898, "step": 3465 }, { "epoch": 1.0878418141158932, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.8764, "step": 3466 }, { "epoch": 1.0881556749970576, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.5382, "step": 3467 }, { "epoch": 1.088469535878222, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.8086, "step": 3468 }, { "epoch": 1.0887833967593865, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.5665, "step": 3469 }, { "epoch": 1.0890972576405509, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 2.0351, "step": 3470 }, { "epoch": 1.0894111185217152, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.0884, "step": 3471 }, { "epoch": 1.0897249794028796, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.8107, "step": 3472 }, { "epoch": 1.0900388402840442, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.7368, "step": 3473 }, { "epoch": 1.0903527011652085, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.4573, "step": 3474 }, { "epoch": 1.090666562046373, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.1211, "step": 3475 }, { "epoch": 1.0909804229275373, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.2903, "step": 3476 }, { "epoch": 1.0912942838087019, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.309, "step": 3477 }, { "epoch": 1.0916081446898662, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 2.4323, "step": 3478 }, { "epoch": 1.0919220055710306, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.7412, "step": 3479 }, { "epoch": 1.092235866452195, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.823, "step": 3480 }, { "epoch": 1.0925497273333595, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.1499, "step": 3481 }, { "epoch": 1.092863588214524, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.7287, "step": 3482 }, { "epoch": 1.0931774490956883, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.7538, "step": 3483 }, { "epoch": 1.0934913099768528, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.8354, "step": 3484 }, { "epoch": 1.0938051708580172, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.2461, "step": 3485 }, { "epoch": 1.0941190317391816, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 2.5617, "step": 3486 }, { "epoch": 1.094432892620346, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5741, "step": 3487 }, { "epoch": 1.0947467535015105, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3492, "step": 3488 }, { "epoch": 1.0950606143826749, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3904, "step": 3489 }, { "epoch": 1.0953744752638392, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3113, "step": 3490 }, { "epoch": 1.0956883361450038, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.3793, "step": 3491 }, { "epoch": 1.0960021970261682, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3095, "step": 3492 }, { "epoch": 1.0963160579073326, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.3662, "step": 3493 }, { "epoch": 1.096629918788497, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.2684, "step": 3494 }, { "epoch": 1.0969437796696615, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3659, "step": 3495 }, { "epoch": 1.0972576405508259, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.3242, "step": 3496 }, { "epoch": 1.0975715014319902, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3947, "step": 3497 }, { "epoch": 1.0978853623131548, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.2836, "step": 3498 }, { "epoch": 1.0981992231943192, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.1691, "step": 3499 }, { "epoch": 1.0985130840754835, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.3641, "step": 3500 }, { "epoch": 1.098826944956648, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4494, "step": 3501 }, { "epoch": 1.0991408058378125, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.2806, "step": 3502 }, { "epoch": 1.0994546667189768, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3992, "step": 3503 }, { "epoch": 1.0997685276001412, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4798, "step": 3504 }, { "epoch": 1.1000823884813056, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.332, "step": 3505 }, { "epoch": 1.1003962493624702, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3876, "step": 3506 }, { "epoch": 1.1007101102436345, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.5442, "step": 3507 }, { "epoch": 1.1010239711247989, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.2679, "step": 3508 }, { "epoch": 1.1013378320059632, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.4113, "step": 3509 }, { "epoch": 1.1016516928871278, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.5659, "step": 3510 }, { "epoch": 1.1019655537682922, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.6196, "step": 3511 }, { "epoch": 1.1022794146494566, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4541, "step": 3512 }, { "epoch": 1.1025932755306211, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.64, "step": 3513 }, { "epoch": 1.1029071364117855, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.4209, "step": 3514 }, { "epoch": 1.1032209972929499, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.5357, "step": 3515 }, { "epoch": 1.1035348581741142, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.8676, "step": 3516 }, { "epoch": 1.1038487190552788, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.4921, "step": 3517 }, { "epoch": 1.1041625799364432, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8682, "step": 3518 }, { "epoch": 1.1044764408176075, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.6149, "step": 3519 }, { "epoch": 1.1047903016987721, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.7332, "step": 3520 }, { "epoch": 1.1051041625799365, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 2.0951, "step": 3521 }, { "epoch": 1.1054180234611009, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7416, "step": 3522 }, { "epoch": 1.1057318843422652, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.9467, "step": 3523 }, { "epoch": 1.1060457452234298, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.9566, "step": 3524 }, { "epoch": 1.1063596061045942, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.1813, "step": 3525 }, { "epoch": 1.1066734669857585, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.0633, "step": 3526 }, { "epoch": 1.1069873278669229, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.1788, "step": 3527 }, { "epoch": 1.1073011887480875, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.8167, "step": 3528 }, { "epoch": 1.1076150496292518, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.9935, "step": 3529 }, { "epoch": 1.1079289105104162, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.9675, "step": 3530 }, { "epoch": 1.1082427713915808, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.2365, "step": 3531 }, { "epoch": 1.1085566322727451, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.8775, "step": 3532 }, { "epoch": 1.1088704931539095, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.58, "step": 3533 }, { "epoch": 1.1091843540350739, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.2402, "step": 3534 }, { "epoch": 1.1094982149162385, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.6955, "step": 3535 }, { "epoch": 1.1098120757974028, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.1472, "step": 3536 }, { "epoch": 1.1101259366785672, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4235, "step": 3537 }, { "epoch": 1.1104397975597315, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.299, "step": 3538 }, { "epoch": 1.1107536584408961, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.2197, "step": 3539 }, { "epoch": 1.1110675193220605, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.5051, "step": 3540 }, { "epoch": 1.1113813802032249, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.3248, "step": 3541 }, { "epoch": 1.1116952410843894, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5145, "step": 3542 }, { "epoch": 1.1120091019655538, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3977, "step": 3543 }, { "epoch": 1.1123229628467182, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4126, "step": 3544 }, { "epoch": 1.1126368237278825, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4288, "step": 3545 }, { "epoch": 1.1129506846090471, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2885, "step": 3546 }, { "epoch": 1.1132645454902115, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.2296, "step": 3547 }, { "epoch": 1.1135784063713758, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.5819, "step": 3548 }, { "epoch": 1.1138922672525404, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.3405, "step": 3549 }, { "epoch": 1.1142061281337048, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.38, "step": 3550 }, { "epoch": 1.1145199890148692, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3125, "step": 3551 }, { "epoch": 1.1148338498960335, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.2946, "step": 3552 }, { "epoch": 1.115147710777198, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.4761, "step": 3553 }, { "epoch": 1.115147710777198, "eval_loss": 1.785923719406128, "eval_runtime": 122.646, "eval_samples_per_second": 8.154, "eval_steps_per_second": 8.154, "step": 3553 }, { "epoch": 1.115147710777198, "mmlu_eval_accuracy": 0.41082880955145434, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.2727272727272727, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.36585365853658536, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5697674418604651, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.41935483870967744, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.5, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.1708553617176551, "step": 3553 }, { "epoch": 1.1154615716583625, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.4877, "step": 3554 }, { "epoch": 1.1157754325395268, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.3422, "step": 3555 }, { "epoch": 1.1160892934206912, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.5457, "step": 3556 }, { "epoch": 1.1164031543018558, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.5776, "step": 3557 }, { "epoch": 1.1167170151830201, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.3775, "step": 3558 }, { "epoch": 1.1170308760641845, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.4051, "step": 3559 }, { "epoch": 1.1173447369453489, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2419, "step": 3560 }, { "epoch": 1.1176585978265134, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.3213, "step": 3561 }, { "epoch": 1.1179724587076778, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.5882, "step": 3562 }, { "epoch": 1.1182863195888422, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.4452, "step": 3563 }, { "epoch": 1.1186001804700068, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.5798, "step": 3564 }, { "epoch": 1.1189140413511711, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.8714, "step": 3565 }, { "epoch": 1.1192279022323355, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.5531, "step": 3566 }, { "epoch": 1.1195417631134998, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.5999, "step": 3567 }, { "epoch": 1.1198556239946644, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.6179, "step": 3568 }, { "epoch": 1.1201694848758288, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.8053, "step": 3569 }, { "epoch": 1.1204833457569932, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.7818, "step": 3570 }, { "epoch": 1.1207972066381577, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.8914, "step": 3571 }, { "epoch": 1.121111067519322, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.9739, "step": 3572 }, { "epoch": 1.1214249284004865, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 2.1106, "step": 3573 }, { "epoch": 1.1217387892816508, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.9206, "step": 3574 }, { "epoch": 1.1220526501628154, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.8017, "step": 3575 }, { "epoch": 1.1223665110439798, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.6244, "step": 3576 }, { "epoch": 1.1226803719251441, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.9483, "step": 3577 }, { "epoch": 1.1229942328063085, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.5826, "step": 3578 }, { "epoch": 1.123308093687473, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.9219, "step": 3579 }, { "epoch": 1.1236219545686374, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.1785, "step": 3580 }, { "epoch": 1.1239358154498018, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.1084, "step": 3581 }, { "epoch": 1.1242496763309664, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.7719, "step": 3582 }, { "epoch": 1.1245635372121308, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.6691, "step": 3583 }, { "epoch": 1.1248773980932951, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.8765, "step": 3584 }, { "epoch": 1.1251912589744595, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.8913, "step": 3585 }, { "epoch": 1.125505119855624, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 2.8588, "step": 3586 }, { "epoch": 1.1258189807367884, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.473, "step": 3587 }, { "epoch": 1.1261328416179528, "grad_norm": 0.1181640625, "learning_rate": 0.0002, "loss": 1.2529, "step": 3588 }, { "epoch": 1.1264467024991172, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.3749, "step": 3589 }, { "epoch": 1.1267605633802817, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.2713, "step": 3590 }, { "epoch": 1.127074424261446, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4169, "step": 3591 }, { "epoch": 1.1273882851426105, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.245, "step": 3592 }, { "epoch": 1.127702146023775, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3938, "step": 3593 }, { "epoch": 1.1280160069049394, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3715, "step": 3594 }, { "epoch": 1.1283298677861038, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.3243, "step": 3595 }, { "epoch": 1.1286437286672681, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.2939, "step": 3596 }, { "epoch": 1.1289575895484327, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.2146, "step": 3597 }, { "epoch": 1.129271450429597, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3606, "step": 3598 }, { "epoch": 1.1295853113107615, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4294, "step": 3599 }, { "epoch": 1.129899172191926, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.4255, "step": 3600 }, { "epoch": 1.1302130330730904, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4143, "step": 3601 }, { "epoch": 1.1305268939542548, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3901, "step": 3602 }, { "epoch": 1.1308407548354191, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4703, "step": 3603 }, { "epoch": 1.1311546157165837, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.2592, "step": 3604 }, { "epoch": 1.131468476597748, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.3548, "step": 3605 }, { "epoch": 1.1317823374789124, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.5083, "step": 3606 }, { "epoch": 1.1320961983600768, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.5108, "step": 3607 }, { "epoch": 1.1324100592412414, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.2707, "step": 3608 }, { "epoch": 1.1327239201224057, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.5571, "step": 3609 }, { "epoch": 1.13303778100357, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.5364, "step": 3610 }, { "epoch": 1.1333516418847345, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.2, "step": 3611 }, { "epoch": 1.133665502765899, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.3271, "step": 3612 }, { "epoch": 1.1339793636470634, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.3633, "step": 3613 }, { "epoch": 1.1342932245282278, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.514, "step": 3614 }, { "epoch": 1.1346070854093924, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.6272, "step": 3615 }, { "epoch": 1.1349209462905567, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.6598, "step": 3616 }, { "epoch": 1.135234807171721, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.7269, "step": 3617 }, { "epoch": 1.1355486680528855, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.8545, "step": 3618 }, { "epoch": 1.13586252893405, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.9285, "step": 3619 }, { "epoch": 1.1361763898152144, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.7557, "step": 3620 }, { "epoch": 1.1364902506963788, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.9605, "step": 3621 }, { "epoch": 1.1368041115775434, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.262, "step": 3622 }, { "epoch": 1.1371179724587077, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.7123, "step": 3623 }, { "epoch": 1.137431833339872, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 2.228, "step": 3624 }, { "epoch": 1.1377456942210364, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.6424, "step": 3625 }, { "epoch": 1.138059555102201, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 2.1712, "step": 3626 }, { "epoch": 1.1383734159833654, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.734, "step": 3627 }, { "epoch": 1.1386872768645298, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.7847, "step": 3628 }, { "epoch": 1.1390011377456943, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.7135, "step": 3629 }, { "epoch": 1.1393149986268587, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.2577, "step": 3630 }, { "epoch": 1.139628859508023, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.34, "step": 3631 }, { "epoch": 1.1399427203891874, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.2824, "step": 3632 }, { "epoch": 1.140256581270352, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.7479, "step": 3633 }, { "epoch": 1.1405704421515164, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.5148, "step": 3634 }, { "epoch": 1.1408843030326807, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.7588, "step": 3635 }, { "epoch": 1.141198163913845, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.4824, "step": 3636 }, { "epoch": 1.1415120247950097, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.9549, "step": 3637 }, { "epoch": 1.141825885676174, "grad_norm": 0.1162109375, "learning_rate": 0.0002, "loss": 1.3516, "step": 3638 }, { "epoch": 1.1421397465573384, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.3547, "step": 3639 }, { "epoch": 1.1424536074385028, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.2967, "step": 3640 }, { "epoch": 1.1427674683196674, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4629, "step": 3641 }, { "epoch": 1.1430813292008317, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6071, "step": 3642 }, { "epoch": 1.143395190081996, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.5825, "step": 3643 }, { "epoch": 1.1437090509631607, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3035, "step": 3644 }, { "epoch": 1.144022911844325, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4255, "step": 3645 }, { "epoch": 1.1443367727254894, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.2383, "step": 3646 }, { "epoch": 1.1446506336066538, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5538, "step": 3647 }, { "epoch": 1.1449644944878183, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3389, "step": 3648 }, { "epoch": 1.1452783553689827, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.4563, "step": 3649 }, { "epoch": 1.145592216250147, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2892, "step": 3650 }, { "epoch": 1.1459060771313117, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.3073, "step": 3651 }, { "epoch": 1.146219938012476, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.3571, "step": 3652 }, { "epoch": 1.1465337988936404, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2609, "step": 3653 }, { "epoch": 1.1468476597748047, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.4362, "step": 3654 }, { "epoch": 1.1471615206559693, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.4871, "step": 3655 }, { "epoch": 1.1474753815371337, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.2907, "step": 3656 }, { "epoch": 1.147789242418298, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.258, "step": 3657 }, { "epoch": 1.1481031032994626, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3671, "step": 3658 }, { "epoch": 1.148416964180627, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.233, "step": 3659 }, { "epoch": 1.1487308250617914, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3437, "step": 3660 }, { "epoch": 1.1490446859429557, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.419, "step": 3661 }, { "epoch": 1.14935854682412, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.6437, "step": 3662 }, { "epoch": 1.1496724077052847, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.9029, "step": 3663 }, { "epoch": 1.149986268586449, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.5138, "step": 3664 }, { "epoch": 1.1503001294676134, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.5226, "step": 3665 }, { "epoch": 1.150613990348778, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.9024, "step": 3666 }, { "epoch": 1.1509278512299423, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.8113, "step": 3667 }, { "epoch": 1.1512417121111067, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.169, "step": 3668 }, { "epoch": 1.151555572992271, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.7122, "step": 3669 }, { "epoch": 1.1518694338734357, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.3097, "step": 3670 }, { "epoch": 1.1521832947546, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.2051, "step": 3671 }, { "epoch": 1.1524971556357644, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.7081, "step": 3672 }, { "epoch": 1.152811016516929, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.8949, "step": 3673 }, { "epoch": 1.1531248773980933, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.0396, "step": 3674 }, { "epoch": 1.1534387382792577, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.1069, "step": 3675 }, { "epoch": 1.153752599160422, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 2.4829, "step": 3676 }, { "epoch": 1.1540664600415866, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.8779, "step": 3677 }, { "epoch": 1.154380320922751, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.9209, "step": 3678 }, { "epoch": 1.1546941818039154, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.9027, "step": 3679 }, { "epoch": 1.15500804268508, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.2018, "step": 3680 }, { "epoch": 1.1553219035662443, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.5032, "step": 3681 }, { "epoch": 1.1556357644474087, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.9819, "step": 3682 }, { "epoch": 1.155949625328573, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.6673, "step": 3683 }, { "epoch": 1.1562634862097376, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.1791, "step": 3684 }, { "epoch": 1.156577347090902, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.086, "step": 3685 }, { "epoch": 1.1568912079720663, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.2853, "step": 3686 }, { "epoch": 1.1572050688532307, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5367, "step": 3687 }, { "epoch": 1.1575189297343953, "grad_norm": 0.12451171875, "learning_rate": 0.0002, "loss": 1.3871, "step": 3688 }, { "epoch": 1.1578327906155597, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4207, "step": 3689 }, { "epoch": 1.158146651496724, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.2606, "step": 3690 }, { "epoch": 1.1584605123778884, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.5775, "step": 3691 }, { "epoch": 1.158774373259053, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3002, "step": 3692 }, { "epoch": 1.1590882341402173, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.2388, "step": 3693 }, { "epoch": 1.1594020950213817, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3587, "step": 3694 }, { "epoch": 1.1597159559025463, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3413, "step": 3695 }, { "epoch": 1.1600298167837106, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.352, "step": 3696 }, { "epoch": 1.160343677664875, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.1805, "step": 3697 }, { "epoch": 1.1606575385460394, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.4525, "step": 3698 }, { "epoch": 1.160971399427204, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.3703, "step": 3699 }, { "epoch": 1.1612852603083683, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.3686, "step": 3700 }, { "epoch": 1.1615991211895327, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.2846, "step": 3701 }, { "epoch": 1.1619129820706973, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3523, "step": 3702 }, { "epoch": 1.1622268429518616, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.4451, "step": 3703 }, { "epoch": 1.162540703833026, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.4352, "step": 3704 }, { "epoch": 1.1628545647141904, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4363, "step": 3705 }, { "epoch": 1.163168425595355, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.3166, "step": 3706 }, { "epoch": 1.1634822864765193, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3889, "step": 3707 }, { "epoch": 1.1637961473576837, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.4201, "step": 3708 }, { "epoch": 1.1641100082388482, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.3659, "step": 3709 }, { "epoch": 1.1644238691200126, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.6474, "step": 3710 }, { "epoch": 1.164737730001177, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.538, "step": 3711 }, { "epoch": 1.1650515908823413, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.8065, "step": 3712 }, { "epoch": 1.165365451763506, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.5863, "step": 3713 }, { "epoch": 1.1656793126446703, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.583, "step": 3714 }, { "epoch": 1.1659931735258346, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.7434, "step": 3715 }, { "epoch": 1.166307034406999, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.264, "step": 3716 }, { "epoch": 1.1666208952881636, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.3905, "step": 3717 }, { "epoch": 1.166934756169328, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.6922, "step": 3718 }, { "epoch": 1.1672486170504923, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 2.1253, "step": 3719 }, { "epoch": 1.1675624779316567, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.9142, "step": 3720 }, { "epoch": 1.1678763388128213, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 2.3627, "step": 3721 }, { "epoch": 1.1681901996939856, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.8973, "step": 3722 }, { "epoch": 1.16850406057515, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.0719, "step": 3723 }, { "epoch": 1.1688179214563146, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.2261, "step": 3724 }, { "epoch": 1.169131782337479, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.9728, "step": 3725 }, { "epoch": 1.1694456432186433, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.9261, "step": 3726 }, { "epoch": 1.1697595040998077, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.9135, "step": 3727 }, { "epoch": 1.1700733649809723, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.9522, "step": 3728 }, { "epoch": 1.1703872258621366, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.5351, "step": 3729 }, { "epoch": 1.170701086743301, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.7964, "step": 3730 }, { "epoch": 1.1710149476244656, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.8993, "step": 3731 }, { "epoch": 1.17132880850563, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.8904, "step": 3732 }, { "epoch": 1.1716426693867943, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.7531, "step": 3733 }, { "epoch": 1.1719565302679587, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.8729, "step": 3734 }, { "epoch": 1.1722703911491232, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.9941, "step": 3735 }, { "epoch": 1.1725842520302876, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 3.032, "step": 3736 }, { "epoch": 1.172898112911452, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.227, "step": 3737 }, { "epoch": 1.1732119737926165, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.2885, "step": 3738 }, { "epoch": 1.173525834673781, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.2461, "step": 3739 }, { "epoch": 1.1738396955549453, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.4896, "step": 3740 }, { "epoch": 1.1738396955549453, "eval_loss": 1.7994587421417236, "eval_runtime": 123.3438, "eval_samples_per_second": 8.107, "eval_steps_per_second": 8.107, "step": 3740 }, { "epoch": 1.1738396955549453, "mmlu_eval_accuracy": 0.40424781951426475, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.5666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.84, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5465116279069767, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.2571428571428571, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.34705882352941175, "mmlu_eval_accuracy_professional_medicine": 0.41935483870967744, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.0847891762434152, "step": 3740 }, { "epoch": 1.1741535564361096, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5034, "step": 3741 }, { "epoch": 1.174467417317274, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.4662, "step": 3742 }, { "epoch": 1.1747812781984386, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3567, "step": 3743 }, { "epoch": 1.175095139079603, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4361, "step": 3744 }, { "epoch": 1.1754089999607673, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4061, "step": 3745 }, { "epoch": 1.175722860841932, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4657, "step": 3746 }, { "epoch": 1.1760367217230963, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.2493, "step": 3747 }, { "epoch": 1.1763505826042606, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.369, "step": 3748 }, { "epoch": 1.176664443485425, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.6183, "step": 3749 }, { "epoch": 1.1769783043665896, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.326, "step": 3750 }, { "epoch": 1.177292165247754, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.4562, "step": 3751 }, { "epoch": 1.1776060261289183, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3425, "step": 3752 }, { "epoch": 1.1779198870100829, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.2195, "step": 3753 }, { "epoch": 1.1782337478912472, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.2509, "step": 3754 }, { "epoch": 1.1785476087724116, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.2863, "step": 3755 }, { "epoch": 1.178861469653576, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.3202, "step": 3756 }, { "epoch": 1.1791753305347406, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4818, "step": 3757 }, { "epoch": 1.179489191415905, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.2737, "step": 3758 }, { "epoch": 1.1798030522970693, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3306, "step": 3759 }, { "epoch": 1.1801169131782339, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.3687, "step": 3760 }, { "epoch": 1.1804307740593982, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.5493, "step": 3761 }, { "epoch": 1.1807446349405626, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.6763, "step": 3762 }, { "epoch": 1.181058495821727, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.6181, "step": 3763 }, { "epoch": 1.1813723567028915, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.8927, "step": 3764 }, { "epoch": 1.181686217584056, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.9099, "step": 3765 }, { "epoch": 1.1820000784652203, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 2.0363, "step": 3766 }, { "epoch": 1.1823139393463846, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.8547, "step": 3767 }, { "epoch": 1.1826278002275492, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.9647, "step": 3768 }, { "epoch": 1.1829416611087136, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7915, "step": 3769 }, { "epoch": 1.183255521989878, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 2.1384, "step": 3770 }, { "epoch": 1.1835693828710423, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.9578, "step": 3771 }, { "epoch": 1.1838832437522069, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.1569, "step": 3772 }, { "epoch": 1.1841971046333712, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.8285, "step": 3773 }, { "epoch": 1.1845109655145356, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.6542, "step": 3774 }, { "epoch": 1.1848248263957002, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.9014, "step": 3775 }, { "epoch": 1.1851386872768646, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.6977, "step": 3776 }, { "epoch": 1.185452548158029, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.8656, "step": 3777 }, { "epoch": 1.1857664090391933, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.8882, "step": 3778 }, { "epoch": 1.1860802699203579, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.0131, "step": 3779 }, { "epoch": 1.1863941308015222, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.326, "step": 3780 }, { "epoch": 1.1867079916826866, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 2.2844, "step": 3781 }, { "epoch": 1.1870218525638512, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.4935, "step": 3782 }, { "epoch": 1.1873357134450155, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.8096, "step": 3783 }, { "epoch": 1.18764957432618, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.9125, "step": 3784 }, { "epoch": 1.1879634352073443, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.1215, "step": 3785 }, { "epoch": 1.1882772960885088, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.6974, "step": 3786 }, { "epoch": 1.1885911569696732, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.4455, "step": 3787 }, { "epoch": 1.1889050178508376, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.3066, "step": 3788 }, { "epoch": 1.1892188787320022, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.2273, "step": 3789 }, { "epoch": 1.1895327396131665, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.3491, "step": 3790 }, { "epoch": 1.1898466004943309, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.3615, "step": 3791 }, { "epoch": 1.1901604613754952, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.2648, "step": 3792 }, { "epoch": 1.1904743222566596, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5231, "step": 3793 }, { "epoch": 1.1907881831378242, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.4563, "step": 3794 }, { "epoch": 1.1911020440189886, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.3943, "step": 3795 }, { "epoch": 1.191415904900153, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3072, "step": 3796 }, { "epoch": 1.1917297657813175, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.1707, "step": 3797 }, { "epoch": 1.1920436266624819, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.3971, "step": 3798 }, { "epoch": 1.1923574875436462, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4205, "step": 3799 }, { "epoch": 1.1926713484248106, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.2905, "step": 3800 }, { "epoch": 1.1929852093059752, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2874, "step": 3801 }, { "epoch": 1.1932990701871395, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.4268, "step": 3802 }, { "epoch": 1.193612931068304, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.2796, "step": 3803 }, { "epoch": 1.1939267919494685, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3508, "step": 3804 }, { "epoch": 1.1942406528306329, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.2908, "step": 3805 }, { "epoch": 1.1945545137117972, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3608, "step": 3806 }, { "epoch": 1.1948683745929616, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.322, "step": 3807 }, { "epoch": 1.1951822354741262, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2416, "step": 3808 }, { "epoch": 1.1954960963552905, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.426, "step": 3809 }, { "epoch": 1.195809957236455, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.4441, "step": 3810 }, { "epoch": 1.1961238181176195, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.4874, "step": 3811 }, { "epoch": 1.1964376789987838, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6444, "step": 3812 }, { "epoch": 1.1967515398799482, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3802, "step": 3813 }, { "epoch": 1.1970654007611126, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4772, "step": 3814 }, { "epoch": 1.1973792616422771, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.7324, "step": 3815 }, { "epoch": 1.1976931225234415, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.5248, "step": 3816 }, { "epoch": 1.1980069834046059, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.651, "step": 3817 }, { "epoch": 1.1983208442857702, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.5842, "step": 3818 }, { "epoch": 1.1986347051669348, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7978, "step": 3819 }, { "epoch": 1.1989485660480992, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.7602, "step": 3820 }, { "epoch": 1.1992624269292635, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.8248, "step": 3821 }, { "epoch": 1.199576287810428, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.8459, "step": 3822 }, { "epoch": 1.1998901486915925, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.8383, "step": 3823 }, { "epoch": 1.2002040095727569, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.9263, "step": 3824 }, { "epoch": 1.2005178704539212, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.1008, "step": 3825 }, { "epoch": 1.2008317313350858, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.9617, "step": 3826 }, { "epoch": 1.2011455922162502, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.8431, "step": 3827 }, { "epoch": 1.2014594530974145, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.1326, "step": 3828 }, { "epoch": 1.201773313978579, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.2094, "step": 3829 }, { "epoch": 1.2020871748597435, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9739, "step": 3830 }, { "epoch": 1.2024010357409078, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.2287, "step": 3831 }, { "epoch": 1.2027148966220722, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.101, "step": 3832 }, { "epoch": 1.2030287575032368, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.8535, "step": 3833 }, { "epoch": 1.2033426183844012, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.7292, "step": 3834 }, { "epoch": 1.2036564792655655, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.4744, "step": 3835 }, { "epoch": 1.2039703401467299, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 2.7216, "step": 3836 }, { "epoch": 1.2042842010278945, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.3469, "step": 3837 }, { "epoch": 1.2045980619090588, "grad_norm": 0.126953125, "learning_rate": 0.0002, "loss": 1.3098, "step": 3838 }, { "epoch": 1.2049119227902232, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.2628, "step": 3839 }, { "epoch": 1.2052257836713878, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.324, "step": 3840 }, { "epoch": 1.2055396445525521, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.3925, "step": 3841 }, { "epoch": 1.2058535054337165, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3853, "step": 3842 }, { "epoch": 1.2061673663148809, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4041, "step": 3843 }, { "epoch": 1.2064812271960454, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.4943, "step": 3844 }, { "epoch": 1.2067950880772098, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.4522, "step": 3845 }, { "epoch": 1.2071089489583742, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.395, "step": 3846 }, { "epoch": 1.2074228098395385, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4094, "step": 3847 }, { "epoch": 1.2077366707207031, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.2752, "step": 3848 }, { "epoch": 1.2080505316018675, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3864, "step": 3849 }, { "epoch": 1.2083643924830318, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3639, "step": 3850 }, { "epoch": 1.2086782533641962, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3288, "step": 3851 }, { "epoch": 1.2089921142453608, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.4141, "step": 3852 }, { "epoch": 1.2093059751265252, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3423, "step": 3853 }, { "epoch": 1.2096198360076895, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.3284, "step": 3854 }, { "epoch": 1.209933696888854, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.4631, "step": 3855 }, { "epoch": 1.2102475577700185, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.2618, "step": 3856 }, { "epoch": 1.2105614186511828, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.4483, "step": 3857 }, { "epoch": 1.2108752795323472, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4485, "step": 3858 }, { "epoch": 1.2111891404135118, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.4875, "step": 3859 }, { "epoch": 1.2115030012946761, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4438, "step": 3860 }, { "epoch": 1.2118168621758405, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.4452, "step": 3861 }, { "epoch": 1.212130723057005, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.5181, "step": 3862 }, { "epoch": 1.2124445839381695, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.4851, "step": 3863 }, { "epoch": 1.2127584448193338, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.5521, "step": 3864 }, { "epoch": 1.2130723057004982, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.69, "step": 3865 }, { "epoch": 1.2133861665816628, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.7878, "step": 3866 }, { "epoch": 1.2137000274628271, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.5545, "step": 3867 }, { "epoch": 1.2140138883439915, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.5646, "step": 3868 }, { "epoch": 1.214327749225156, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.9537, "step": 3869 }, { "epoch": 1.2146416101063204, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.7254, "step": 3870 }, { "epoch": 1.2149554709874848, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6222, "step": 3871 }, { "epoch": 1.2152693318686492, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.2117, "step": 3872 }, { "epoch": 1.2155831927498135, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.3514, "step": 3873 }, { "epoch": 1.215897053630978, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.8412, "step": 3874 }, { "epoch": 1.2162109145121425, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 2.1561, "step": 3875 }, { "epoch": 1.2165247753933068, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.0765, "step": 3876 }, { "epoch": 1.2168386362744714, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8924, "step": 3877 }, { "epoch": 1.2171524971556358, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.2694, "step": 3878 }, { "epoch": 1.2174663580368001, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.0347, "step": 3879 }, { "epoch": 1.2177802189179645, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.9706, "step": 3880 }, { "epoch": 1.218094079799129, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 2.0052, "step": 3881 }, { "epoch": 1.2184079406802935, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.8245, "step": 3882 }, { "epoch": 1.2187218015614578, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.787, "step": 3883 }, { "epoch": 1.2190356624426224, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.7334, "step": 3884 }, { "epoch": 1.2193495233237868, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.853, "step": 3885 }, { "epoch": 1.2196633842049511, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 2.5601, "step": 3886 }, { "epoch": 1.2199772450861155, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.5006, "step": 3887 }, { "epoch": 1.22029110596728, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3305, "step": 3888 }, { "epoch": 1.2206049668484444, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3465, "step": 3889 }, { "epoch": 1.2209188277296088, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4851, "step": 3890 }, { "epoch": 1.2212326886107734, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3539, "step": 3891 }, { "epoch": 1.2215465494919377, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.486, "step": 3892 }, { "epoch": 1.2218604103731021, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.2972, "step": 3893 }, { "epoch": 1.2221742712542665, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3355, "step": 3894 }, { "epoch": 1.222488132135431, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3257, "step": 3895 }, { "epoch": 1.2228019930165954, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4074, "step": 3896 }, { "epoch": 1.2231158538977598, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3131, "step": 3897 }, { "epoch": 1.2234297147789241, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3275, "step": 3898 }, { "epoch": 1.2237435756600887, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.3617, "step": 3899 }, { "epoch": 1.224057436541253, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3141, "step": 3900 }, { "epoch": 1.2243712974224175, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.2291, "step": 3901 }, { "epoch": 1.2246851583035818, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.1175, "step": 3902 }, { "epoch": 1.2249990191847464, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.5335, "step": 3903 }, { "epoch": 1.2253128800659108, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.4631, "step": 3904 }, { "epoch": 1.2256267409470751, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.1809, "step": 3905 }, { "epoch": 1.2259406018282397, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.3521, "step": 3906 }, { "epoch": 1.226254462709404, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.1823, "step": 3907 }, { "epoch": 1.2265683235905684, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3867, "step": 3908 }, { "epoch": 1.2268821844717328, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3609, "step": 3909 }, { "epoch": 1.2271960453528974, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4966, "step": 3910 }, { "epoch": 1.2275099062340618, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2672, "step": 3911 }, { "epoch": 1.2278237671152261, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2476, "step": 3912 }, { "epoch": 1.2281376279963907, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.6099, "step": 3913 }, { "epoch": 1.228451488877555, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.5465, "step": 3914 }, { "epoch": 1.2287653497587194, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.7972, "step": 3915 }, { "epoch": 1.2290792106398838, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.8126, "step": 3916 }, { "epoch": 1.2293930715210484, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.6936, "step": 3917 }, { "epoch": 1.2297069324022127, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.853, "step": 3918 }, { "epoch": 1.230020793283377, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.8028, "step": 3919 }, { "epoch": 1.2303346541645417, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.4233, "step": 3920 }, { "epoch": 1.230648515045706, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.8537, "step": 3921 }, { "epoch": 1.2309623759268704, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.1938, "step": 3922 }, { "epoch": 1.2312762368080348, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.5935, "step": 3923 }, { "epoch": 1.2315900976891994, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.5907, "step": 3924 }, { "epoch": 1.2319039585703637, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.5777, "step": 3925 }, { "epoch": 1.232217819451528, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 2.1, "step": 3926 }, { "epoch": 1.2325316803326924, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.9012, "step": 3927 }, { "epoch": 1.2325316803326924, "eval_loss": 1.7945857048034668, "eval_runtime": 123.0461, "eval_samples_per_second": 8.127, "eval_steps_per_second": 8.127, "step": 3927 }, { "epoch": 1.2325316803326924, "mmlu_eval_accuracy": 0.3989423337967472, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, "mmlu_eval_accuracy_high_school_psychology": 0.55, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.4838709677419355, "mmlu_eval_accuracy_professional_law": 0.3235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.1527687325187947, "step": 3927 }, { "epoch": 1.232845541213857, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.9394, "step": 3928 }, { "epoch": 1.2331594020950214, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.5628, "step": 3929 }, { "epoch": 1.2334732629761858, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.0467, "step": 3930 }, { "epoch": 1.2337871238573501, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 2.5013, "step": 3931 }, { "epoch": 1.2341009847385147, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.7232, "step": 3932 }, { "epoch": 1.234414845619679, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.0227, "step": 3933 }, { "epoch": 1.2347287065008434, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.9215, "step": 3934 }, { "epoch": 1.235042567382008, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.0873, "step": 3935 }, { "epoch": 1.2353564282631724, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.3414, "step": 3936 }, { "epoch": 1.2356702891443367, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.5739, "step": 3937 }, { "epoch": 1.235984150025501, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3787, "step": 3938 }, { "epoch": 1.2362980109066657, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4379, "step": 3939 }, { "epoch": 1.23661187178783, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5092, "step": 3940 }, { "epoch": 1.2369257326689944, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.2708, "step": 3941 }, { "epoch": 1.237239593550159, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.3394, "step": 3942 }, { "epoch": 1.2375534544313234, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.2485, "step": 3943 }, { "epoch": 1.2378673153124877, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.2642, "step": 3944 }, { "epoch": 1.238181176193652, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.2613, "step": 3945 }, { "epoch": 1.2384950370748167, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3982, "step": 3946 }, { "epoch": 1.238808897955981, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5133, "step": 3947 }, { "epoch": 1.2391227588371454, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3276, "step": 3948 }, { "epoch": 1.2394366197183098, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2339, "step": 3949 }, { "epoch": 1.2397504805994743, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5666, "step": 3950 }, { "epoch": 1.2400643414806387, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.4242, "step": 3951 }, { "epoch": 1.240378202361803, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.524, "step": 3952 }, { "epoch": 1.2406920632429674, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.4408, "step": 3953 }, { "epoch": 1.241005924124132, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.3607, "step": 3954 }, { "epoch": 1.2413197850052964, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3292, "step": 3955 }, { "epoch": 1.2416336458864607, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3111, "step": 3956 }, { "epoch": 1.2419475067676253, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.5065, "step": 3957 }, { "epoch": 1.2422613676487897, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.4442, "step": 3958 }, { "epoch": 1.242575228529954, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.6287, "step": 3959 }, { "epoch": 1.2428890894111184, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.4796, "step": 3960 }, { "epoch": 1.243202950292283, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3231, "step": 3961 }, { "epoch": 1.2435168111734474, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4953, "step": 3962 }, { "epoch": 1.2438306720546117, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.4468, "step": 3963 }, { "epoch": 1.2441445329357763, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.7283, "step": 3964 }, { "epoch": 1.2444583938169407, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.7535, "step": 3965 }, { "epoch": 1.244772254698105, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.2766, "step": 3966 }, { "epoch": 1.2450861155792694, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.5373, "step": 3967 }, { "epoch": 1.245399976460434, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.6022, "step": 3968 }, { "epoch": 1.2457138373415984, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.5153, "step": 3969 }, { "epoch": 1.2460276982227627, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.7013, "step": 3970 }, { "epoch": 1.2463415591039273, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.0935, "step": 3971 }, { "epoch": 1.2466554199850917, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.8361, "step": 3972 }, { "epoch": 1.246969280866256, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.0163, "step": 3973 }, { "epoch": 1.2472831417474204, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.9271, "step": 3974 }, { "epoch": 1.247597002628585, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.6085, "step": 3975 }, { "epoch": 1.2479108635097493, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.0353, "step": 3976 }, { "epoch": 1.2482247243909137, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.0623, "step": 3977 }, { "epoch": 1.248538585272078, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.6417, "step": 3978 }, { "epoch": 1.2488524461532426, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.8706, "step": 3979 }, { "epoch": 1.249166307034407, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.0887, "step": 3980 }, { "epoch": 1.2494801679155714, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.0637, "step": 3981 }, { "epoch": 1.2497940287967357, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.684, "step": 3982 }, { "epoch": 1.2501078896779003, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.6825, "step": 3983 }, { "epoch": 1.2504217505590647, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.5943, "step": 3984 }, { "epoch": 1.250735611440229, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.3663, "step": 3985 }, { "epoch": 1.2510494723213936, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.5565, "step": 3986 }, { "epoch": 1.251363333202558, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5542, "step": 3987 }, { "epoch": 1.2516771940837224, "grad_norm": 0.10400390625, "learning_rate": 0.0002, "loss": 1.0363, "step": 3988 }, { "epoch": 1.2519910549648867, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.3929, "step": 3989 }, { "epoch": 1.2523049158460513, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.324, "step": 3990 }, { "epoch": 1.2526187767272157, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.413, "step": 3991 }, { "epoch": 1.25293263760838, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.4615, "step": 3992 }, { "epoch": 1.2532464984895446, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3599, "step": 3993 }, { "epoch": 1.253560359370709, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3142, "step": 3994 }, { "epoch": 1.2538742202518733, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4337, "step": 3995 }, { "epoch": 1.2541880811330377, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.3202, "step": 3996 }, { "epoch": 1.2545019420142023, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3973, "step": 3997 }, { "epoch": 1.2548158028953666, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3098, "step": 3998 }, { "epoch": 1.255129663776531, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.2877, "step": 3999 }, { "epoch": 1.2554435246576956, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.3905, "step": 4000 }, { "epoch": 1.25575738553886, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.4616, "step": 4001 }, { "epoch": 1.2560712464200243, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4432, "step": 4002 }, { "epoch": 1.2563851073011887, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.2567, "step": 4003 }, { "epoch": 1.256698968182353, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3767, "step": 4004 }, { "epoch": 1.2570128290635176, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4998, "step": 4005 }, { "epoch": 1.257326689944682, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.332, "step": 4006 }, { "epoch": 1.2576405508258466, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3372, "step": 4007 }, { "epoch": 1.257954411707011, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.5853, "step": 4008 }, { "epoch": 1.2582682725881753, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2254, "step": 4009 }, { "epoch": 1.2585821334693397, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.6269, "step": 4010 }, { "epoch": 1.258895994350504, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.6903, "step": 4011 }, { "epoch": 1.2592098552316686, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.6119, "step": 4012 }, { "epoch": 1.259523716112833, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.6418, "step": 4013 }, { "epoch": 1.2598375769939973, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.6182, "step": 4014 }, { "epoch": 1.260151437875162, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2947, "step": 4015 }, { "epoch": 1.2604652987563263, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.6339, "step": 4016 }, { "epoch": 1.2607791596374907, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7531, "step": 4017 }, { "epoch": 1.261093020518655, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.2208, "step": 4018 }, { "epoch": 1.2614068813998196, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.9273, "step": 4019 }, { "epoch": 1.261720742280984, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.0857, "step": 4020 }, { "epoch": 1.2620346031621483, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.8421, "step": 4021 }, { "epoch": 1.262348464043313, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.8256, "step": 4022 }, { "epoch": 1.2626623249244773, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.8785, "step": 4023 }, { "epoch": 1.2629761858056416, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 2.2222, "step": 4024 }, { "epoch": 1.263290046686806, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 2.7284, "step": 4025 }, { "epoch": 1.2636039075679704, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.0976, "step": 4026 }, { "epoch": 1.263917768449135, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.9994, "step": 4027 }, { "epoch": 1.2642316293302993, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 2.2444, "step": 4028 }, { "epoch": 1.264545490211464, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.826, "step": 4029 }, { "epoch": 1.2648593510926283, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.9897, "step": 4030 }, { "epoch": 1.2651732119737926, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.7658, "step": 4031 }, { "epoch": 1.265487072854957, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.3832, "step": 4032 }, { "epoch": 1.2658009337361213, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.6482, "step": 4033 }, { "epoch": 1.266114794617286, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.6645, "step": 4034 }, { "epoch": 1.2664286554984503, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.5989, "step": 4035 }, { "epoch": 1.2667425163796147, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 2.2411, "step": 4036 }, { "epoch": 1.2670563772607792, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4773, "step": 4037 }, { "epoch": 1.2673702381419436, "grad_norm": 0.1083984375, "learning_rate": 0.0002, "loss": 1.1318, "step": 4038 }, { "epoch": 1.267684099023108, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3812, "step": 4039 }, { "epoch": 1.2679979599042723, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3225, "step": 4040 }, { "epoch": 1.268311820785437, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.2517, "step": 4041 }, { "epoch": 1.2686256816666013, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3175, "step": 4042 }, { "epoch": 1.2689395425477656, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.4699, "step": 4043 }, { "epoch": 1.2692534034289302, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3492, "step": 4044 }, { "epoch": 1.2695672643100946, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.2974, "step": 4045 }, { "epoch": 1.269881125191259, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.3531, "step": 4046 }, { "epoch": 1.2701949860724233, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4536, "step": 4047 }, { "epoch": 1.270508846953588, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.2499, "step": 4048 }, { "epoch": 1.2708227078347523, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3348, "step": 4049 }, { "epoch": 1.2711365687159166, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3379, "step": 4050 }, { "epoch": 1.2714504295970812, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.4797, "step": 4051 }, { "epoch": 1.2717642904782456, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3381, "step": 4052 }, { "epoch": 1.27207815135941, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.4228, "step": 4053 }, { "epoch": 1.2723920122405743, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.4404, "step": 4054 }, { "epoch": 1.2727058731217387, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.4416, "step": 4055 }, { "epoch": 1.2730197340029032, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.3466, "step": 4056 }, { "epoch": 1.2733335948840676, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.3853, "step": 4057 }, { "epoch": 1.2736474557652322, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3325, "step": 4058 }, { "epoch": 1.2739613166463966, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.5336, "step": 4059 }, { "epoch": 1.274275177527561, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3639, "step": 4060 }, { "epoch": 1.2745890384087253, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.5649, "step": 4061 }, { "epoch": 1.2749028992898896, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.4593, "step": 4062 }, { "epoch": 1.2752167601710542, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.5844, "step": 4063 }, { "epoch": 1.2755306210522186, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4457, "step": 4064 }, { "epoch": 1.275844481933383, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.7896, "step": 4065 }, { "epoch": 1.2761583428145475, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.6183, "step": 4066 }, { "epoch": 1.276472203695712, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.4078, "step": 4067 }, { "epoch": 1.2767860645768763, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.736, "step": 4068 }, { "epoch": 1.2770999254580406, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.7595, "step": 4069 }, { "epoch": 1.2774137863392052, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.2398, "step": 4070 }, { "epoch": 1.2777276472203696, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.6555, "step": 4071 }, { "epoch": 1.278041508101534, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.4183, "step": 4072 }, { "epoch": 1.2783553689826985, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.6922, "step": 4073 }, { "epoch": 1.2786692298638629, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.9127, "step": 4074 }, { "epoch": 1.2789830907450273, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 2.0626, "step": 4075 }, { "epoch": 1.2792969516261916, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.9806, "step": 4076 }, { "epoch": 1.279610812507356, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.1743, "step": 4077 }, { "epoch": 1.2799246733885206, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.2794, "step": 4078 }, { "epoch": 1.280238534269685, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.6562, "step": 4079 }, { "epoch": 1.2805523951508495, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.0937, "step": 4080 }, { "epoch": 1.2808662560320139, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.783, "step": 4081 }, { "epoch": 1.2811801169131782, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6448, "step": 4082 }, { "epoch": 1.2814939777943426, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.6891, "step": 4083 }, { "epoch": 1.281807838675507, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.092, "step": 4084 }, { "epoch": 1.2821216995566715, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.905, "step": 4085 }, { "epoch": 1.282435560437836, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.2499, "step": 4086 }, { "epoch": 1.2827494213190003, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.5193, "step": 4087 }, { "epoch": 1.2830632822001649, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.3801, "step": 4088 }, { "epoch": 1.2833771430813292, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.273, "step": 4089 }, { "epoch": 1.2836910039624936, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4607, "step": 4090 }, { "epoch": 1.284004864843658, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.4262, "step": 4091 }, { "epoch": 1.2843187257248225, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.4388, "step": 4092 }, { "epoch": 1.284632586605987, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.1754, "step": 4093 }, { "epoch": 1.2849464474871513, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.3231, "step": 4094 }, { "epoch": 1.2852603083683158, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.1401, "step": 4095 }, { "epoch": 1.2855741692494802, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.2105, "step": 4096 }, { "epoch": 1.2858880301306446, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.1321, "step": 4097 }, { "epoch": 1.286201891011809, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3177, "step": 4098 }, { "epoch": 1.2865157518929735, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.4483, "step": 4099 }, { "epoch": 1.2868296127741379, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.3236, "step": 4100 }, { "epoch": 1.2871434736553022, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.3283, "step": 4101 }, { "epoch": 1.2874573345364668, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4345, "step": 4102 }, { "epoch": 1.2877711954176312, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.4472, "step": 4103 }, { "epoch": 1.2880850562987955, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.2968, "step": 4104 }, { "epoch": 1.28839891717996, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.4617, "step": 4105 }, { "epoch": 1.2887127780611243, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.1371, "step": 4106 }, { "epoch": 1.2890266389422889, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.4406, "step": 4107 }, { "epoch": 1.2893404998234532, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.4313, "step": 4108 }, { "epoch": 1.2896543607046178, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.5082, "step": 4109 }, { "epoch": 1.2899682215857822, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.613, "step": 4110 }, { "epoch": 1.2902820824669465, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4634, "step": 4111 }, { "epoch": 1.290595943348111, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.3694, "step": 4112 }, { "epoch": 1.2909098042292753, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4812, "step": 4113 }, { "epoch": 1.2912236651104398, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.5116, "step": 4114 }, { "epoch": 1.2912236651104398, "eval_loss": 1.781996250152588, "eval_runtime": 123.3814, "eval_samples_per_second": 8.105, "eval_steps_per_second": 8.105, "step": 4114 }, { "epoch": 1.2912236651104398, "mmlu_eval_accuracy": 0.4049450626432859, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.5666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.3, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.45161290322580644, "mmlu_eval_accuracy_professional_law": 0.3176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2858767965741598, "step": 4114 }, { "epoch": 1.2915375259916042, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.8054, "step": 4115 }, { "epoch": 1.2918513868727686, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.6913, "step": 4116 }, { "epoch": 1.2921652477539332, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.6834, "step": 4117 }, { "epoch": 1.2924791086350975, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.7491, "step": 4118 }, { "epoch": 1.2927929695162619, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.9833, "step": 4119 }, { "epoch": 1.2931068303974262, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.7453, "step": 4120 }, { "epoch": 1.2934206912785908, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.0319, "step": 4121 }, { "epoch": 1.2937345521597552, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.905, "step": 4122 }, { "epoch": 1.2940484130409196, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.3193, "step": 4123 }, { "epoch": 1.2943622739220841, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.8523, "step": 4124 }, { "epoch": 1.2946761348032485, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.8196, "step": 4125 }, { "epoch": 1.2949899956844129, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.7706, "step": 4126 }, { "epoch": 1.2953038565655772, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.8172, "step": 4127 }, { "epoch": 1.2956177174467418, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.6792, "step": 4128 }, { "epoch": 1.2959315783279062, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.4328, "step": 4129 }, { "epoch": 1.2962454392090705, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.1643, "step": 4130 }, { "epoch": 1.2965593000902351, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.7205, "step": 4131 }, { "epoch": 1.2968731609713995, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 2.1623, "step": 4132 }, { "epoch": 1.2971870218525638, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.8202, "step": 4133 }, { "epoch": 1.2975008827337282, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5895, "step": 4134 }, { "epoch": 1.2978147436148926, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.074, "step": 4135 }, { "epoch": 1.2981286044960572, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 2.431, "step": 4136 }, { "epoch": 1.2984424653772215, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.5394, "step": 4137 }, { "epoch": 1.298756326258386, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.3751, "step": 4138 }, { "epoch": 1.2990701871395505, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3041, "step": 4139 }, { "epoch": 1.2993840480207148, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.2854, "step": 4140 }, { "epoch": 1.2996979089018792, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3936, "step": 4141 }, { "epoch": 1.3000117697830436, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.4258, "step": 4142 }, { "epoch": 1.3003256306642081, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.1228, "step": 4143 }, { "epoch": 1.3006394915453725, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4088, "step": 4144 }, { "epoch": 1.3009533524265369, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.257, "step": 4145 }, { "epoch": 1.3012672133077015, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.3949, "step": 4146 }, { "epoch": 1.3015810741888658, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.3965, "step": 4147 }, { "epoch": 1.3018949350700302, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.2891, "step": 4148 }, { "epoch": 1.3022087959511945, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.2491, "step": 4149 }, { "epoch": 1.3025226568323591, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.3742, "step": 4150 }, { "epoch": 1.3028365177135235, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.3521, "step": 4151 }, { "epoch": 1.3031503785946879, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3397, "step": 4152 }, { "epoch": 1.3034642394758524, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.3382, "step": 4153 }, { "epoch": 1.3037781003570168, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.415, "step": 4154 }, { "epoch": 1.3040919612381812, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2975, "step": 4155 }, { "epoch": 1.3044058221193455, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2389, "step": 4156 }, { "epoch": 1.3047196830005099, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2805, "step": 4157 }, { "epoch": 1.3050335438816745, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3648, "step": 4158 }, { "epoch": 1.3053474047628388, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.5424, "step": 4159 }, { "epoch": 1.3056612656440034, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.5212, "step": 4160 }, { "epoch": 1.3059751265251678, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3956, "step": 4161 }, { "epoch": 1.3062889874063321, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.5542, "step": 4162 }, { "epoch": 1.3066028482874965, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.4505, "step": 4163 }, { "epoch": 1.3069167091686609, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.8976, "step": 4164 }, { "epoch": 1.3072305700498255, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.5576, "step": 4165 }, { "epoch": 1.3075444309309898, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.588, "step": 4166 }, { "epoch": 1.3078582918121542, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.5278, "step": 4167 }, { "epoch": 1.3081721526933188, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.1306, "step": 4168 }, { "epoch": 1.3084860135744831, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.8992, "step": 4169 }, { "epoch": 1.3087998744556475, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.1338, "step": 4170 }, { "epoch": 1.3091137353368119, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.6033, "step": 4171 }, { "epoch": 1.3094275962179764, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.4472, "step": 4172 }, { "epoch": 1.3097414570991408, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.4998, "step": 4173 }, { "epoch": 1.3100553179803052, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.0096, "step": 4174 }, { "epoch": 1.3103691788614698, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1268, "step": 4175 }, { "epoch": 1.3106830397426341, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.8187, "step": 4176 }, { "epoch": 1.3109969006237985, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.6616, "step": 4177 }, { "epoch": 1.3113107615049628, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 2.2469, "step": 4178 }, { "epoch": 1.3116246223861274, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.246, "step": 4179 }, { "epoch": 1.3119384832672918, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.279, "step": 4180 }, { "epoch": 1.3122523441484562, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.8124, "step": 4181 }, { "epoch": 1.3125662050296207, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6443, "step": 4182 }, { "epoch": 1.312880065910785, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.5571, "step": 4183 }, { "epoch": 1.3131939267919495, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.577, "step": 4184 }, { "epoch": 1.3135077876731138, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.8116, "step": 4185 }, { "epoch": 1.3138216485542782, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 2.5815, "step": 4186 }, { "epoch": 1.3141355094354428, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.5853, "step": 4187 }, { "epoch": 1.3144493703166071, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.4784, "step": 4188 }, { "epoch": 1.3147632311977717, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3305, "step": 4189 }, { "epoch": 1.315077092078936, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.253, "step": 4190 }, { "epoch": 1.3153909529601004, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.2798, "step": 4191 }, { "epoch": 1.3157048138412648, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.273, "step": 4192 }, { "epoch": 1.3160186747224292, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.3309, "step": 4193 }, { "epoch": 1.3163325356035938, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4338, "step": 4194 }, { "epoch": 1.3166463964847581, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.2822, "step": 4195 }, { "epoch": 1.3169602573659225, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.2953, "step": 4196 }, { "epoch": 1.317274118247087, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.3072, "step": 4197 }, { "epoch": 1.3175879791282514, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3306, "step": 4198 }, { "epoch": 1.3179018400094158, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.361, "step": 4199 }, { "epoch": 1.3182157008905802, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.2953, "step": 4200 }, { "epoch": 1.3185295617717447, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3355, "step": 4201 }, { "epoch": 1.318843422652909, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.2873, "step": 4202 }, { "epoch": 1.3191572835340735, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3111, "step": 4203 }, { "epoch": 1.319471144415238, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.2623, "step": 4204 }, { "epoch": 1.3197850052964024, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3187, "step": 4205 }, { "epoch": 1.3200988661775668, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2645, "step": 4206 }, { "epoch": 1.3204127270587311, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.5755, "step": 4207 }, { "epoch": 1.3207265879398957, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.4094, "step": 4208 }, { "epoch": 1.32104044882106, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.5062, "step": 4209 }, { "epoch": 1.3213543097022244, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.5129, "step": 4210 }, { "epoch": 1.321668170583389, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4351, "step": 4211 }, { "epoch": 1.3219820314645534, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.5621, "step": 4212 }, { "epoch": 1.3222958923457178, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2832, "step": 4213 }, { "epoch": 1.3226097532268821, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.686, "step": 4214 }, { "epoch": 1.3229236141080465, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.8661, "step": 4215 }, { "epoch": 1.323237474989211, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.4074, "step": 4216 }, { "epoch": 1.3235513358703754, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.8661, "step": 4217 }, { "epoch": 1.3238651967515398, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.6044, "step": 4218 }, { "epoch": 1.3241790576327044, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6835, "step": 4219 }, { "epoch": 1.3244929185138687, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7726, "step": 4220 }, { "epoch": 1.324806779395033, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.81, "step": 4221 }, { "epoch": 1.3251206402761975, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.9177, "step": 4222 }, { "epoch": 1.325434501157362, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.8793, "step": 4223 }, { "epoch": 1.3257483620385264, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.9163, "step": 4224 }, { "epoch": 1.3260622229196908, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.823, "step": 4225 }, { "epoch": 1.3263760838008554, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.8248, "step": 4226 }, { "epoch": 1.3266899446820197, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.4049, "step": 4227 }, { "epoch": 1.327003805563184, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.1969, "step": 4228 }, { "epoch": 1.3273176664443485, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.1375, "step": 4229 }, { "epoch": 1.327631527325513, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.0477, "step": 4230 }, { "epoch": 1.3279453882066774, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.9783, "step": 4231 }, { "epoch": 1.3282592490878418, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9565, "step": 4232 }, { "epoch": 1.3285731099690063, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 2.3455, "step": 4233 }, { "epoch": 1.3288869708501707, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.9415, "step": 4234 }, { "epoch": 1.329200831731335, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.5521, "step": 4235 }, { "epoch": 1.3295146926124994, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.1808, "step": 4236 }, { "epoch": 1.3298285534936638, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6169, "step": 4237 }, { "epoch": 1.3301424143748284, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2474, "step": 4238 }, { "epoch": 1.3304562752559927, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.3209, "step": 4239 }, { "epoch": 1.3307701361371573, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4461, "step": 4240 }, { "epoch": 1.3310839970183217, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3047, "step": 4241 }, { "epoch": 1.331397857899486, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.3713, "step": 4242 }, { "epoch": 1.3317117187806504, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.2495, "step": 4243 }, { "epoch": 1.3320255796618148, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.3678, "step": 4244 }, { "epoch": 1.3323394405429794, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3047, "step": 4245 }, { "epoch": 1.3326533014241437, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3341, "step": 4246 }, { "epoch": 1.332967162305308, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.4346, "step": 4247 }, { "epoch": 1.3332810231864727, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.1887, "step": 4248 }, { "epoch": 1.333594884067637, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.223, "step": 4249 }, { "epoch": 1.3339087449488014, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.2228, "step": 4250 }, { "epoch": 1.3342226058299658, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2542, "step": 4251 }, { "epoch": 1.3345364667111304, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.4315, "step": 4252 }, { "epoch": 1.3348503275922947, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.357, "step": 4253 }, { "epoch": 1.335164188473459, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.487, "step": 4254 }, { "epoch": 1.3354780493546237, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3971, "step": 4255 }, { "epoch": 1.335791910235788, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.3167, "step": 4256 }, { "epoch": 1.3361057711169524, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3869, "step": 4257 }, { "epoch": 1.3364196319981168, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.4561, "step": 4258 }, { "epoch": 1.3367334928792813, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.4991, "step": 4259 }, { "epoch": 1.3370473537604457, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.4555, "step": 4260 }, { "epoch": 1.33736121464161, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3808, "step": 4261 }, { "epoch": 1.3376750755227746, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.8497, "step": 4262 }, { "epoch": 1.337988936403939, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3665, "step": 4263 }, { "epoch": 1.3383027972851034, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.6702, "step": 4264 }, { "epoch": 1.3386166581662677, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.8136, "step": 4265 }, { "epoch": 1.338930519047432, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.9163, "step": 4266 }, { "epoch": 1.3392443799285967, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.6328, "step": 4267 }, { "epoch": 1.339558240809761, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.8287, "step": 4268 }, { "epoch": 1.3398721016909256, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7631, "step": 4269 }, { "epoch": 1.34018596257209, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.9577, "step": 4270 }, { "epoch": 1.3404998234532544, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.6557, "step": 4271 }, { "epoch": 1.3408136843344187, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.4754, "step": 4272 }, { "epoch": 1.341127545215583, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.9046, "step": 4273 }, { "epoch": 1.3414414060967477, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.872, "step": 4274 }, { "epoch": 1.341755266977912, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.6745, "step": 4275 }, { "epoch": 1.3420691278590764, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.8666, "step": 4276 }, { "epoch": 1.342382988740241, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 2.2783, "step": 4277 }, { "epoch": 1.3426968496214053, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.5121, "step": 4278 }, { "epoch": 1.3430107105025697, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.7287, "step": 4279 }, { "epoch": 1.343324571383734, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.6171, "step": 4280 }, { "epoch": 1.3436384322648987, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.9601, "step": 4281 }, { "epoch": 1.343952293146063, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6547, "step": 4282 }, { "epoch": 1.3442661540272274, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.9258, "step": 4283 }, { "epoch": 1.344580014908392, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.8691, "step": 4284 }, { "epoch": 1.3448938757895563, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.7277, "step": 4285 }, { "epoch": 1.3452077366707207, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.6264, "step": 4286 }, { "epoch": 1.345521597551885, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.6122, "step": 4287 }, { "epoch": 1.3458354584330494, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.3902, "step": 4288 }, { "epoch": 1.346149319314214, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.2705, "step": 4289 }, { "epoch": 1.3464631801953784, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.4497, "step": 4290 }, { "epoch": 1.346777041076543, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2441, "step": 4291 }, { "epoch": 1.3470909019577073, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3874, "step": 4292 }, { "epoch": 1.3474047628388717, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.3759, "step": 4293 }, { "epoch": 1.347718623720036, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.4092, "step": 4294 }, { "epoch": 1.3480324846012004, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3588, "step": 4295 }, { "epoch": 1.348346345482365, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5746, "step": 4296 }, { "epoch": 1.3486602063635293, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3454, "step": 4297 }, { "epoch": 1.3489740672446937, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.4089, "step": 4298 }, { "epoch": 1.3492879281258583, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.51, "step": 4299 }, { "epoch": 1.3496017890070227, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3044, "step": 4300 }, { "epoch": 1.349915649888187, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.3375, "step": 4301 }, { "epoch": 1.349915649888187, "eval_loss": 1.7903716564178467, "eval_runtime": 123.6262, "eval_samples_per_second": 8.089, "eval_steps_per_second": 8.089, "step": 4301 }, { "epoch": 1.349915649888187, "mmlu_eval_accuracy": 0.3908505406721757, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.2727272727272727, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.5833333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.36363636363636365, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.45161290322580644, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.1934973221055671, "step": 4301 }, { "epoch": 1.3502295107693514, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.26, "step": 4302 }, { "epoch": 1.350543371650516, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.2986, "step": 4303 }, { "epoch": 1.3508572325316803, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.375, "step": 4304 }, { "epoch": 1.3511710934128447, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.4536, "step": 4305 }, { "epoch": 1.3514849542940093, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.4375, "step": 4306 }, { "epoch": 1.3517988151751736, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3164, "step": 4307 }, { "epoch": 1.352112676056338, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.6206, "step": 4308 }, { "epoch": 1.3524265369375024, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.388, "step": 4309 }, { "epoch": 1.352740397818667, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.1809, "step": 4310 }, { "epoch": 1.3530542586998313, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4941, "step": 4311 }, { "epoch": 1.3533681195809957, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.2362, "step": 4312 }, { "epoch": 1.3536819804621603, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.4381, "step": 4313 }, { "epoch": 1.3539958413433246, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.6817, "step": 4314 }, { "epoch": 1.354309702224489, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.4231, "step": 4315 }, { "epoch": 1.3546235631056533, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.6306, "step": 4316 }, { "epoch": 1.3549374239868177, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.8893, "step": 4317 }, { "epoch": 1.3552512848679823, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.791, "step": 4318 }, { "epoch": 1.3555651457491467, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.5052, "step": 4319 }, { "epoch": 1.3558790066303112, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.6438, "step": 4320 }, { "epoch": 1.3561928675114756, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.9806, "step": 4321 }, { "epoch": 1.35650672839264, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.0848, "step": 4322 }, { "epoch": 1.3568205892738043, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.9008, "step": 4323 }, { "epoch": 1.3571344501549687, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.3209, "step": 4324 }, { "epoch": 1.3574483110361333, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 2.3895, "step": 4325 }, { "epoch": 1.3577621719172976, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.8199, "step": 4326 }, { "epoch": 1.358076032798462, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.7933, "step": 4327 }, { "epoch": 1.3583898936796266, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.3329, "step": 4328 }, { "epoch": 1.358703754560791, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.1252, "step": 4329 }, { "epoch": 1.3590176154419553, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.8916, "step": 4330 }, { "epoch": 1.3593314763231197, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.862, "step": 4331 }, { "epoch": 1.3596453372042843, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.8936, "step": 4332 }, { "epoch": 1.3599591980854486, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.2088, "step": 4333 }, { "epoch": 1.360273058966613, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.6653, "step": 4334 }, { "epoch": 1.3605869198477776, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.849, "step": 4335 }, { "epoch": 1.360900780728942, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.4726, "step": 4336 }, { "epoch": 1.3612146416101063, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5145, "step": 4337 }, { "epoch": 1.3615285024912707, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.3967, "step": 4338 }, { "epoch": 1.3618423633724352, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.394, "step": 4339 }, { "epoch": 1.3621562242535996, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4337, "step": 4340 }, { "epoch": 1.362470085134764, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5217, "step": 4341 }, { "epoch": 1.3627839460159286, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.3281, "step": 4342 }, { "epoch": 1.363097806897093, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.2755, "step": 4343 }, { "epoch": 1.3634116677782573, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.3414, "step": 4344 }, { "epoch": 1.3637255286594216, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.285, "step": 4345 }, { "epoch": 1.364039389540586, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3532, "step": 4346 }, { "epoch": 1.3643532504217506, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3468, "step": 4347 }, { "epoch": 1.364667111302915, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.2533, "step": 4348 }, { "epoch": 1.3649809721840795, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.4034, "step": 4349 }, { "epoch": 1.365294833065244, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.332, "step": 4350 }, { "epoch": 1.3656086939464083, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3745, "step": 4351 }, { "epoch": 1.3659225548275726, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3052, "step": 4352 }, { "epoch": 1.366236415708737, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.4312, "step": 4353 }, { "epoch": 1.3665502765899016, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4191, "step": 4354 }, { "epoch": 1.366864137471066, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4128, "step": 4355 }, { "epoch": 1.3671779983522303, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2835, "step": 4356 }, { "epoch": 1.367491859233395, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.4531, "step": 4357 }, { "epoch": 1.3678057201145593, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.4206, "step": 4358 }, { "epoch": 1.3681195809957236, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4837, "step": 4359 }, { "epoch": 1.368433441876888, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.7016, "step": 4360 }, { "epoch": 1.3687473027580526, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.2667, "step": 4361 }, { "epoch": 1.369061163639217, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4391, "step": 4362 }, { "epoch": 1.3693750245203813, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2737, "step": 4363 }, { "epoch": 1.3696888854015459, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.52, "step": 4364 }, { "epoch": 1.3700027462827102, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.5624, "step": 4365 }, { "epoch": 1.3703166071638746, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.5594, "step": 4366 }, { "epoch": 1.370630468045039, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.571, "step": 4367 }, { "epoch": 1.3709443289262033, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.8231, "step": 4368 }, { "epoch": 1.371258189807368, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6716, "step": 4369 }, { "epoch": 1.3715720506885323, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.8874, "step": 4370 }, { "epoch": 1.3718859115696969, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.8814, "step": 4371 }, { "epoch": 1.3721997724508612, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.0901, "step": 4372 }, { "epoch": 1.3725136333320256, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 2.1328, "step": 4373 }, { "epoch": 1.37282749421319, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.1256, "step": 4374 }, { "epoch": 1.3731413550943543, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.0031, "step": 4375 }, { "epoch": 1.373455215975519, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.8179, "step": 4376 }, { "epoch": 1.3737690768566833, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.0201, "step": 4377 }, { "epoch": 1.3740829377378476, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.0034, "step": 4378 }, { "epoch": 1.3743967986190122, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.3234, "step": 4379 }, { "epoch": 1.3747106595001766, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.5664, "step": 4380 }, { "epoch": 1.375024520381341, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.8863, "step": 4381 }, { "epoch": 1.3753383812625053, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.3873, "step": 4382 }, { "epoch": 1.3756522421436699, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.6511, "step": 4383 }, { "epoch": 1.3759661030248342, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 2.1379, "step": 4384 }, { "epoch": 1.3762799639059986, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.1523, "step": 4385 }, { "epoch": 1.3765938247871632, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 2.4014, "step": 4386 }, { "epoch": 1.3769076856683276, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.3388, "step": 4387 }, { "epoch": 1.377221546549492, "grad_norm": 0.10498046875, "learning_rate": 0.0002, "loss": 1.2743, "step": 4388 }, { "epoch": 1.3775354074306563, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.2867, "step": 4389 }, { "epoch": 1.3778492683118209, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.3837, "step": 4390 }, { "epoch": 1.3781631291929852, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3365, "step": 4391 }, { "epoch": 1.3784769900741496, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.2984, "step": 4392 }, { "epoch": 1.3787908509553142, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3257, "step": 4393 }, { "epoch": 1.3791047118364785, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.1303, "step": 4394 }, { "epoch": 1.379418572717643, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.3418, "step": 4395 }, { "epoch": 1.3797324335988073, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4456, "step": 4396 }, { "epoch": 1.3800462944799716, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.2782, "step": 4397 }, { "epoch": 1.3803601553611362, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.2839, "step": 4398 }, { "epoch": 1.3806740162423006, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4818, "step": 4399 }, { "epoch": 1.3809878771234652, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.3646, "step": 4400 }, { "epoch": 1.3813017380046295, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.3834, "step": 4401 }, { "epoch": 1.3816155988857939, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.2856, "step": 4402 }, { "epoch": 1.3819294597669582, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.336, "step": 4403 }, { "epoch": 1.3822433206481226, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.351, "step": 4404 }, { "epoch": 1.3825571815292872, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.5375, "step": 4405 }, { "epoch": 1.3828710424104516, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.2959, "step": 4406 }, { "epoch": 1.383184903291616, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.2444, "step": 4407 }, { "epoch": 1.3834987641727805, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.275, "step": 4408 }, { "epoch": 1.3838126250539449, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.4684, "step": 4409 }, { "epoch": 1.3841264859351092, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.3096, "step": 4410 }, { "epoch": 1.3844403468162736, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.5674, "step": 4411 }, { "epoch": 1.3847542076974382, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3748, "step": 4412 }, { "epoch": 1.3850680685786025, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.4567, "step": 4413 }, { "epoch": 1.385381929459767, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2749, "step": 4414 }, { "epoch": 1.3856957903409315, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.4029, "step": 4415 }, { "epoch": 1.3860096512220959, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.6521, "step": 4416 }, { "epoch": 1.3863235121032602, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.7173, "step": 4417 }, { "epoch": 1.3866373729844246, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.4553, "step": 4418 }, { "epoch": 1.386951233865589, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.8413, "step": 4419 }, { "epoch": 1.3872650947467535, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.807, "step": 4420 }, { "epoch": 1.3875789556279179, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.684, "step": 4421 }, { "epoch": 1.3878928165090825, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.9317, "step": 4422 }, { "epoch": 1.3882066773902468, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7097, "step": 4423 }, { "epoch": 1.3885205382714112, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.3175, "step": 4424 }, { "epoch": 1.3888343991525756, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.8468, "step": 4425 }, { "epoch": 1.38914826003374, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.85, "step": 4426 }, { "epoch": 1.3894621209149045, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.0743, "step": 4427 }, { "epoch": 1.3897759817960689, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.0869, "step": 4428 }, { "epoch": 1.3900898426772332, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.5633, "step": 4429 }, { "epoch": 1.3904037035583978, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.1579, "step": 4430 }, { "epoch": 1.3907175644395622, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.4854, "step": 4431 }, { "epoch": 1.3910314253207265, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.6308, "step": 4432 }, { "epoch": 1.391345286201891, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.7707, "step": 4433 }, { "epoch": 1.3916591470830555, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.8306, "step": 4434 }, { "epoch": 1.3919730079642199, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.0241, "step": 4435 }, { "epoch": 1.3922868688453842, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.3202, "step": 4436 }, { "epoch": 1.3926007297265488, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.4694, "step": 4437 }, { "epoch": 1.3929145906077132, "grad_norm": 0.1220703125, "learning_rate": 0.0002, "loss": 1.3261, "step": 4438 }, { "epoch": 1.3932284514888775, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4208, "step": 4439 }, { "epoch": 1.393542312370042, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.4576, "step": 4440 }, { "epoch": 1.3938561732512065, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3911, "step": 4441 }, { "epoch": 1.3941700341323708, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3504, "step": 4442 }, { "epoch": 1.3944838950135352, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.2102, "step": 4443 }, { "epoch": 1.3947977558946998, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.3065, "step": 4444 }, { "epoch": 1.3951116167758641, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3616, "step": 4445 }, { "epoch": 1.3954254776570285, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.3182, "step": 4446 }, { "epoch": 1.3957393385381929, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.4715, "step": 4447 }, { "epoch": 1.3960531994193572, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.259, "step": 4448 }, { "epoch": 1.3963670603005218, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.4684, "step": 4449 }, { "epoch": 1.3966809211816862, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.4605, "step": 4450 }, { "epoch": 1.3969947820628508, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.2664, "step": 4451 }, { "epoch": 1.3973086429440151, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.3576, "step": 4452 }, { "epoch": 1.3976225038251795, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.2497, "step": 4453 }, { "epoch": 1.3979363647063439, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.2739, "step": 4454 }, { "epoch": 1.3982502255875082, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4105, "step": 4455 }, { "epoch": 1.3985640864686728, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.293, "step": 4456 }, { "epoch": 1.3988779473498372, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.4529, "step": 4457 }, { "epoch": 1.3991918082310015, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.2133, "step": 4458 }, { "epoch": 1.3995056691121661, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.559, "step": 4459 }, { "epoch": 1.3998195299933305, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2974, "step": 4460 }, { "epoch": 1.4001333908744948, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.4939, "step": 4461 }, { "epoch": 1.4004472517556592, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.4051, "step": 4462 }, { "epoch": 1.4007611126368238, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4685, "step": 4463 }, { "epoch": 1.4010749735179882, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.5184, "step": 4464 }, { "epoch": 1.4013888343991525, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.9703, "step": 4465 }, { "epoch": 1.401702695280317, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.9405, "step": 4466 }, { "epoch": 1.4020165561614815, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.9013, "step": 4467 }, { "epoch": 1.4023304170426458, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.8151, "step": 4468 }, { "epoch": 1.4026442779238102, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.0511, "step": 4469 }, { "epoch": 1.4029581388049748, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.1633, "step": 4470 }, { "epoch": 1.4032719996861391, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.0867, "step": 4471 }, { "epoch": 1.4035858605673035, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.6026, "step": 4472 }, { "epoch": 1.403899721448468, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.9118, "step": 4473 }, { "epoch": 1.4042135823296324, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.4391, "step": 4474 }, { "epoch": 1.4045274432107968, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.8097, "step": 4475 }, { "epoch": 1.4048413040919612, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 2.2578, "step": 4476 }, { "epoch": 1.4051551649731255, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.0483, "step": 4477 }, { "epoch": 1.4054690258542901, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 2.2424, "step": 4478 }, { "epoch": 1.4057828867354545, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.3727, "step": 4479 }, { "epoch": 1.406096747616619, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.8564, "step": 4480 }, { "epoch": 1.4064106084977834, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.9299, "step": 4481 }, { "epoch": 1.4067244693789478, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.0507, "step": 4482 }, { "epoch": 1.4070383302601122, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.2124, "step": 4483 }, { "epoch": 1.4073521911412765, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.0349, "step": 4484 }, { "epoch": 1.407666052022441, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 2.1551, "step": 4485 }, { "epoch": 1.4079799129036055, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 2.9484, "step": 4486 }, { "epoch": 1.4082937737847698, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.3464, "step": 4487 }, { "epoch": 1.4086076346659344, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4638, "step": 4488 }, { "epoch": 1.4086076346659344, "eval_loss": 1.7950125932693481, "eval_runtime": 122.7541, "eval_samples_per_second": 8.146, "eval_steps_per_second": 8.146, "step": 4488 }, { "epoch": 1.4086076346659344, "mmlu_eval_accuracy": 0.41516172277955254, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.41935483870967744, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2162521599574778, "step": 4488 }, { "epoch": 1.4089214955470988, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4172, "step": 4489 }, { "epoch": 1.4092353564282631, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3756, "step": 4490 }, { "epoch": 1.4095492173094275, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.3264, "step": 4491 }, { "epoch": 1.409863078190592, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3612, "step": 4492 }, { "epoch": 1.4101769390717565, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3923, "step": 4493 }, { "epoch": 1.4104907999529208, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.452, "step": 4494 }, { "epoch": 1.4108046608340854, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.2037, "step": 4495 }, { "epoch": 1.4111185217152498, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.4086, "step": 4496 }, { "epoch": 1.4114323825964141, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3997, "step": 4497 }, { "epoch": 1.4117462434775785, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.4006, "step": 4498 }, { "epoch": 1.4120601043587429, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.4248, "step": 4499 }, { "epoch": 1.4123739652399074, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.2983, "step": 4500 }, { "epoch": 1.4126878261210718, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.2411, "step": 4501 }, { "epoch": 1.4130016870022364, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.404, "step": 4502 }, { "epoch": 1.4133155478834007, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2068, "step": 4503 }, { "epoch": 1.413629408764565, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.4444, "step": 4504 }, { "epoch": 1.4139432696457295, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3189, "step": 4505 }, { "epoch": 1.4142571305268938, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.4255, "step": 4506 }, { "epoch": 1.4145709914080584, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.2476, "step": 4507 }, { "epoch": 1.4148848522892228, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.5722, "step": 4508 }, { "epoch": 1.4151987131703871, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.182, "step": 4509 }, { "epoch": 1.4155125740515517, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.5858, "step": 4510 }, { "epoch": 1.415826434932716, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.3466, "step": 4511 }, { "epoch": 1.4161402958138805, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.2869, "step": 4512 }, { "epoch": 1.4164541566950448, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.662, "step": 4513 }, { "epoch": 1.4167680175762094, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4312, "step": 4514 }, { "epoch": 1.4170818784573738, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.5378, "step": 4515 }, { "epoch": 1.4173957393385381, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.405, "step": 4516 }, { "epoch": 1.4177096002197027, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.5854, "step": 4517 }, { "epoch": 1.418023461100867, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.5027, "step": 4518 }, { "epoch": 1.4183373219820314, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.8634, "step": 4519 }, { "epoch": 1.4186511828631958, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.2005, "step": 4520 }, { "epoch": 1.4189650437443604, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.8067, "step": 4521 }, { "epoch": 1.4192789046255248, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.3058, "step": 4522 }, { "epoch": 1.4195927655066891, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.7765, "step": 4523 }, { "epoch": 1.4199066263878537, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.8979, "step": 4524 }, { "epoch": 1.420220487269018, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.0376, "step": 4525 }, { "epoch": 1.4205343481501824, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.2349, "step": 4526 }, { "epoch": 1.4208482090313468, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.8554, "step": 4527 }, { "epoch": 1.4211620699125111, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 2.308, "step": 4528 }, { "epoch": 1.4214759307936757, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.1704, "step": 4529 }, { "epoch": 1.42178979167484, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 2.1974, "step": 4530 }, { "epoch": 1.4221036525560047, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 2.3392, "step": 4531 }, { "epoch": 1.422417513437169, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 2.5634, "step": 4532 }, { "epoch": 1.4227313743183334, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.7705, "step": 4533 }, { "epoch": 1.4230452351994978, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.6408, "step": 4534 }, { "epoch": 1.4233590960806621, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.1068, "step": 4535 }, { "epoch": 1.4236729569618267, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 2.5958, "step": 4536 }, { "epoch": 1.423986817842991, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6286, "step": 4537 }, { "epoch": 1.4243006787241554, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.322, "step": 4538 }, { "epoch": 1.42461453960532, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6027, "step": 4539 }, { "epoch": 1.4249284004864844, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3281, "step": 4540 }, { "epoch": 1.4252422613676488, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4585, "step": 4541 }, { "epoch": 1.4255561222488131, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3164, "step": 4542 }, { "epoch": 1.4258699831299777, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4013, "step": 4543 }, { "epoch": 1.426183844011142, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.4631, "step": 4544 }, { "epoch": 1.4264977048923064, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4571, "step": 4545 }, { "epoch": 1.426811565773471, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.2838, "step": 4546 }, { "epoch": 1.4271254266546354, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.2789, "step": 4547 }, { "epoch": 1.4274392875357997, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.4884, "step": 4548 }, { "epoch": 1.427753148416964, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.3723, "step": 4549 }, { "epoch": 1.4280670092981287, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.4656, "step": 4550 }, { "epoch": 1.428380870179293, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.3133, "step": 4551 }, { "epoch": 1.4286947310604574, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.4014, "step": 4552 }, { "epoch": 1.429008591941622, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.455, "step": 4553 }, { "epoch": 1.4293224528227864, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.3893, "step": 4554 }, { "epoch": 1.4296363137039507, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4411, "step": 4555 }, { "epoch": 1.429950174585115, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3577, "step": 4556 }, { "epoch": 1.4302640354662794, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2991, "step": 4557 }, { "epoch": 1.430577896347444, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.4277, "step": 4558 }, { "epoch": 1.4308917572286084, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.6938, "step": 4559 }, { "epoch": 1.431205618109773, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2626, "step": 4560 }, { "epoch": 1.4315194789909373, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.3176, "step": 4561 }, { "epoch": 1.4318333398721017, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.3864, "step": 4562 }, { "epoch": 1.432147200753266, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3965, "step": 4563 }, { "epoch": 1.4324610616344304, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2969, "step": 4564 }, { "epoch": 1.432774922515595, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.523, "step": 4565 }, { "epoch": 1.4330887833967594, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2708, "step": 4566 }, { "epoch": 1.4334026442779237, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.7182, "step": 4567 }, { "epoch": 1.4337165051590883, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.9066, "step": 4568 }, { "epoch": 1.4340303660402527, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.7612, "step": 4569 }, { "epoch": 1.434344226921417, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.8798, "step": 4570 }, { "epoch": 1.4346580878025814, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8699, "step": 4571 }, { "epoch": 1.434971948683746, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.9965, "step": 4572 }, { "epoch": 1.4352858095649104, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.0068, "step": 4573 }, { "epoch": 1.4355996704460747, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.344, "step": 4574 }, { "epoch": 1.4359135313272393, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 2.4216, "step": 4575 }, { "epoch": 1.4362273922084037, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.9731, "step": 4576 }, { "epoch": 1.436541253089568, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 2.6413, "step": 4577 }, { "epoch": 1.4368551139707324, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.869, "step": 4578 }, { "epoch": 1.4371689748518968, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.9478, "step": 4579 }, { "epoch": 1.4374828357330613, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.5531, "step": 4580 }, { "epoch": 1.4377966966142257, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.9357, "step": 4581 }, { "epoch": 1.4381105574953903, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.9204, "step": 4582 }, { "epoch": 1.4384244183765547, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.7352, "step": 4583 }, { "epoch": 1.438738279257719, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.7622, "step": 4584 }, { "epoch": 1.4390521401388834, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.2262, "step": 4585 }, { "epoch": 1.4393660010200477, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.381, "step": 4586 }, { "epoch": 1.4396798619012123, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.5675, "step": 4587 }, { "epoch": 1.4399937227823767, "grad_norm": 0.1259765625, "learning_rate": 0.0002, "loss": 1.3227, "step": 4588 }, { "epoch": 1.440307583663541, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.503, "step": 4589 }, { "epoch": 1.4406214445447056, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.4612, "step": 4590 }, { "epoch": 1.44093530542587, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.381, "step": 4591 }, { "epoch": 1.4412491663070344, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5395, "step": 4592 }, { "epoch": 1.4415630271881987, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4429, "step": 4593 }, { "epoch": 1.4418768880693633, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.4253, "step": 4594 }, { "epoch": 1.4421907489505277, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3615, "step": 4595 }, { "epoch": 1.442504609831692, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.1712, "step": 4596 }, { "epoch": 1.4428184707128566, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.3181, "step": 4597 }, { "epoch": 1.443132331594021, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.2507, "step": 4598 }, { "epoch": 1.4434461924751854, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.4418, "step": 4599 }, { "epoch": 1.4437600533563497, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3556, "step": 4600 }, { "epoch": 1.4440739142375143, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3598, "step": 4601 }, { "epoch": 1.4443877751186787, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4568, "step": 4602 }, { "epoch": 1.444701635999843, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3942, "step": 4603 }, { "epoch": 1.4450154968810076, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2559, "step": 4604 }, { "epoch": 1.445329357762172, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.3777, "step": 4605 }, { "epoch": 1.4456432186433363, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3852, "step": 4606 }, { "epoch": 1.4459570795245007, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.4648, "step": 4607 }, { "epoch": 1.446270940405665, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4135, "step": 4608 }, { "epoch": 1.4465848012868296, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.6244, "step": 4609 }, { "epoch": 1.446898662167994, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.3172, "step": 4610 }, { "epoch": 1.4472125230491586, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.6917, "step": 4611 }, { "epoch": 1.447526383930323, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4561, "step": 4612 }, { "epoch": 1.4478402448114873, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.6565, "step": 4613 }, { "epoch": 1.4481541056926517, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.8942, "step": 4614 }, { "epoch": 1.448467966573816, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.7879, "step": 4615 }, { "epoch": 1.4487818274549806, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.9602, "step": 4616 }, { "epoch": 1.449095688336145, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.8152, "step": 4617 }, { "epoch": 1.4494095492173094, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.0924, "step": 4618 }, { "epoch": 1.449723410098474, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.757, "step": 4619 }, { "epoch": 1.4500372709796383, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6861, "step": 4620 }, { "epoch": 1.4503511318608027, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.6873, "step": 4621 }, { "epoch": 1.450664992741967, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 2.0753, "step": 4622 }, { "epoch": 1.4509788536231316, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.788, "step": 4623 }, { "epoch": 1.451292714504296, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 2.0385, "step": 4624 }, { "epoch": 1.4516065753854603, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.82, "step": 4625 }, { "epoch": 1.451920436266625, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 2.1059, "step": 4626 }, { "epoch": 1.4522342971477893, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.2155, "step": 4627 }, { "epoch": 1.4525481580289537, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.1566, "step": 4628 }, { "epoch": 1.452862018910118, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.7559, "step": 4629 }, { "epoch": 1.4531758797912824, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.2951, "step": 4630 }, { "epoch": 1.453489740672447, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.9572, "step": 4631 }, { "epoch": 1.4538036015536113, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.035, "step": 4632 }, { "epoch": 1.454117462434776, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.622, "step": 4633 }, { "epoch": 1.4544313233159403, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.8148, "step": 4634 }, { "epoch": 1.4547451841971046, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 2.4329, "step": 4635 }, { "epoch": 1.455059045078269, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.4666, "step": 4636 }, { "epoch": 1.4553729059594334, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.5192, "step": 4637 }, { "epoch": 1.455686766840598, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4045, "step": 4638 }, { "epoch": 1.4560006277217623, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.3559, "step": 4639 }, { "epoch": 1.4563144886029267, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.3047, "step": 4640 }, { "epoch": 1.4566283494840913, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4302, "step": 4641 }, { "epoch": 1.4569422103652556, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3603, "step": 4642 }, { "epoch": 1.45725607124642, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.3478, "step": 4643 }, { "epoch": 1.4575699321275843, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4566, "step": 4644 }, { "epoch": 1.457883793008749, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4345, "step": 4645 }, { "epoch": 1.4581976538899133, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3806, "step": 4646 }, { "epoch": 1.4585115147710777, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.4406, "step": 4647 }, { "epoch": 1.4588253756522422, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3077, "step": 4648 }, { "epoch": 1.4591392365334066, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.4456, "step": 4649 }, { "epoch": 1.459453097414571, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.4145, "step": 4650 }, { "epoch": 1.4597669582957353, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.3238, "step": 4651 }, { "epoch": 1.4600808191769, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.1412, "step": 4652 }, { "epoch": 1.4603946800580643, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.2406, "step": 4653 }, { "epoch": 1.4607085409392286, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.4524, "step": 4654 }, { "epoch": 1.4610224018203932, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2858, "step": 4655 }, { "epoch": 1.4613362627015576, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2985, "step": 4656 }, { "epoch": 1.461650123582722, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.6286, "step": 4657 }, { "epoch": 1.4619639844638863, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.687, "step": 4658 }, { "epoch": 1.4622778453450507, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.3283, "step": 4659 }, { "epoch": 1.4625917062262153, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3415, "step": 4660 }, { "epoch": 1.4629055671073796, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2517, "step": 4661 }, { "epoch": 1.4632194279885442, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3651, "step": 4662 }, { "epoch": 1.4635332888697086, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.68, "step": 4663 }, { "epoch": 1.463847149750873, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.8188, "step": 4664 }, { "epoch": 1.4641610106320373, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.6716, "step": 4665 }, { "epoch": 1.4644748715132017, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.6055, "step": 4666 }, { "epoch": 1.4647887323943662, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7878, "step": 4667 }, { "epoch": 1.4651025932755306, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.5551, "step": 4668 }, { "epoch": 1.465416454156695, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.8907, "step": 4669 }, { "epoch": 1.4657303150378596, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.9346, "step": 4670 }, { "epoch": 1.466044175919024, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.6004, "step": 4671 }, { "epoch": 1.4663580368001883, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.8251, "step": 4672 }, { "epoch": 1.4666718976813526, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.1775, "step": 4673 }, { "epoch": 1.4669857585625172, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.7053, "step": 4674 }, { "epoch": 1.4672996194436816, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.1193, "step": 4675 }, { "epoch": 1.4672996194436816, "eval_loss": 1.7845886945724487, "eval_runtime": 123.49, "eval_samples_per_second": 8.098, "eval_steps_per_second": 8.098, "step": 4675 }, { "epoch": 1.4672996194436816, "mmlu_eval_accuracy": 0.39873369989462604, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.5, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.2727272727272727, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.55, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.29411764705882354, "mmlu_eval_accuracy_professional_medicine": 0.4838709677419355, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.7368421052631579, "mmlu_loss": 1.18201911997398, "step": 4675 }, { "epoch": 1.467613480324846, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9196, "step": 4676 }, { "epoch": 1.4679273412060105, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.9435, "step": 4677 }, { "epoch": 1.468241202087175, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.6398, "step": 4678 }, { "epoch": 1.4685550629683393, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7233, "step": 4679 }, { "epoch": 1.4688689238495036, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 2.2635, "step": 4680 }, { "epoch": 1.4691827847306682, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 1.9, "step": 4681 }, { "epoch": 1.4694966456118326, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.0136, "step": 4682 }, { "epoch": 1.469810506492997, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.9494, "step": 4683 }, { "epoch": 1.4701243673741615, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.9991, "step": 4684 }, { "epoch": 1.4704382282553259, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.1223, "step": 4685 }, { "epoch": 1.4707520891364902, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 2.8974, "step": 4686 }, { "epoch": 1.4710659500176546, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.4895, "step": 4687 }, { "epoch": 1.471379810898819, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.2938, "step": 4688 }, { "epoch": 1.4716936717799836, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5057, "step": 4689 }, { "epoch": 1.472007532661148, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4302, "step": 4690 }, { "epoch": 1.4723213935423125, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4823, "step": 4691 }, { "epoch": 1.4726352544234769, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4628, "step": 4692 }, { "epoch": 1.4729491153046412, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.4033, "step": 4693 }, { "epoch": 1.4732629761858056, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.2811, "step": 4694 }, { "epoch": 1.47357683706697, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.2457, "step": 4695 }, { "epoch": 1.4738906979481345, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.3602, "step": 4696 }, { "epoch": 1.474204558829299, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.427, "step": 4697 }, { "epoch": 1.4745184197104633, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.2397, "step": 4698 }, { "epoch": 1.4748322805916279, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.2726, "step": 4699 }, { "epoch": 1.4751461414727922, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.5702, "step": 4700 }, { "epoch": 1.4754600023539566, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4863, "step": 4701 }, { "epoch": 1.475773863235121, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.6425, "step": 4702 }, { "epoch": 1.4760877241162855, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.2094, "step": 4703 }, { "epoch": 1.47640158499745, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3114, "step": 4704 }, { "epoch": 1.4767154458786143, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.3582, "step": 4705 }, { "epoch": 1.4770293067597788, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.367, "step": 4706 }, { "epoch": 1.4773431676409432, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.264, "step": 4707 }, { "epoch": 1.4776570285221076, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.5233, "step": 4708 }, { "epoch": 1.477970889403272, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.388, "step": 4709 }, { "epoch": 1.4782847502844363, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4282, "step": 4710 }, { "epoch": 1.4785986111656009, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.3807, "step": 4711 }, { "epoch": 1.4789124720467652, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.6471, "step": 4712 }, { "epoch": 1.4792263329279298, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.6978, "step": 4713 }, { "epoch": 1.4795401938090942, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7307, "step": 4714 }, { "epoch": 1.4798540546902585, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.4917, "step": 4715 }, { "epoch": 1.480167915571423, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.6797, "step": 4716 }, { "epoch": 1.4804817764525873, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.882, "step": 4717 }, { "epoch": 1.4807956373337519, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.7856, "step": 4718 }, { "epoch": 1.4811094982149162, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.4896, "step": 4719 }, { "epoch": 1.4814233590960806, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.7564, "step": 4720 }, { "epoch": 1.4817372199772452, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.6041, "step": 4721 }, { "epoch": 1.4820510808584095, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.6861, "step": 4722 }, { "epoch": 1.482364941739574, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6922, "step": 4723 }, { "epoch": 1.4826788026207383, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.1272, "step": 4724 }, { "epoch": 1.4829926635019028, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.8977, "step": 4725 }, { "epoch": 1.4833065243830672, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 2.0812, "step": 4726 }, { "epoch": 1.4836203852642316, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.1398, "step": 4727 }, { "epoch": 1.4839342461453962, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 2.179, "step": 4728 }, { "epoch": 1.4842481070265605, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.8119, "step": 4729 }, { "epoch": 1.4845619679077249, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 2.6156, "step": 4730 }, { "epoch": 1.4848758287888892, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.9482, "step": 4731 }, { "epoch": 1.4851896896700538, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.5828, "step": 4732 }, { "epoch": 1.4855035505512182, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.5911, "step": 4733 }, { "epoch": 1.4858174114323826, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.7881, "step": 4734 }, { "epoch": 1.4861312723135471, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.8778, "step": 4735 }, { "epoch": 1.4864451331947115, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 2.6713, "step": 4736 }, { "epoch": 1.4867589940758759, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.5121, "step": 4737 }, { "epoch": 1.4870728549570402, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.3319, "step": 4738 }, { "epoch": 1.4873867158382046, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.267, "step": 4739 }, { "epoch": 1.4877005767193692, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5039, "step": 4740 }, { "epoch": 1.4880144376005335, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.1809, "step": 4741 }, { "epoch": 1.4883282984816981, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.2295, "step": 4742 }, { "epoch": 1.4886421593628625, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.3372, "step": 4743 }, { "epoch": 1.4889560202440268, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.2749, "step": 4744 }, { "epoch": 1.4892698811251912, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.1847, "step": 4745 }, { "epoch": 1.4895837420063556, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4192, "step": 4746 }, { "epoch": 1.4898976028875202, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.283, "step": 4747 }, { "epoch": 1.4902114637686845, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.4341, "step": 4748 }, { "epoch": 1.4905253246498489, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.3149, "step": 4749 }, { "epoch": 1.4908391855310135, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.4504, "step": 4750 }, { "epoch": 1.4911530464121778, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.2171, "step": 4751 }, { "epoch": 1.4914669072933422, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.3384, "step": 4752 }, { "epoch": 1.4917807681745066, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4283, "step": 4753 }, { "epoch": 1.4920946290556711, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3798, "step": 4754 }, { "epoch": 1.4924084899368355, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.5737, "step": 4755 }, { "epoch": 1.4927223508179999, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.5947, "step": 4756 }, { "epoch": 1.4930362116991645, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4335, "step": 4757 }, { "epoch": 1.4933500725803288, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3925, "step": 4758 }, { "epoch": 1.4936639334614932, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2464, "step": 4759 }, { "epoch": 1.4939777943426575, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.5044, "step": 4760 }, { "epoch": 1.4942916552238221, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.5591, "step": 4761 }, { "epoch": 1.4946055161049865, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.8971, "step": 4762 }, { "epoch": 1.4949193769861508, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.3473, "step": 4763 }, { "epoch": 1.4952332378673154, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2611, "step": 4764 }, { "epoch": 1.4955470987484798, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.7825, "step": 4765 }, { "epoch": 1.4958609596296442, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.8171, "step": 4766 }, { "epoch": 1.4961748205108085, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.7894, "step": 4767 }, { "epoch": 1.4964886813919729, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.8907, "step": 4768 }, { "epoch": 1.4968025422731375, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.2738, "step": 4769 }, { "epoch": 1.4971164031543018, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.6945, "step": 4770 }, { "epoch": 1.4974302640354662, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 2.0429, "step": 4771 }, { "epoch": 1.4977441249166308, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.356, "step": 4772 }, { "epoch": 1.4980579857977951, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.576, "step": 4773 }, { "epoch": 1.4983718466789595, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.0138, "step": 4774 }, { "epoch": 1.4986857075601239, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.8323, "step": 4775 }, { "epoch": 1.4989995684412885, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9431, "step": 4776 }, { "epoch": 1.4993134293224528, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.0335, "step": 4777 }, { "epoch": 1.4996272902036172, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.0601, "step": 4778 }, { "epoch": 1.4999411510847818, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.2626, "step": 4779 }, { "epoch": 1.5002550119659461, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.7814, "step": 4780 }, { "epoch": 1.5005688728471105, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.3872, "step": 4781 }, { "epoch": 1.5008827337282749, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.5471, "step": 4782 }, { "epoch": 1.5011965946094392, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.8402, "step": 4783 }, { "epoch": 1.5015104554906038, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 2.095, "step": 4784 }, { "epoch": 1.5018243163717682, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.2912, "step": 4785 }, { "epoch": 1.5021381772529327, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 2.6753, "step": 4786 }, { "epoch": 1.5024520381340971, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.626, "step": 4787 }, { "epoch": 1.5027658990152615, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.3211, "step": 4788 }, { "epoch": 1.5030797598964258, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3371, "step": 4789 }, { "epoch": 1.5033936207775902, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.4466, "step": 4790 }, { "epoch": 1.5037074816587548, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4366, "step": 4791 }, { "epoch": 1.5040213425399191, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3662, "step": 4792 }, { "epoch": 1.5043352034210837, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.5024, "step": 4793 }, { "epoch": 1.504649064302248, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.3876, "step": 4794 }, { "epoch": 1.5049629251834125, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3244, "step": 4795 }, { "epoch": 1.5052767860645768, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2709, "step": 4796 }, { "epoch": 1.5055906469457412, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.2322, "step": 4797 }, { "epoch": 1.5059045078269058, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4231, "step": 4798 }, { "epoch": 1.5062183687080701, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3405, "step": 4799 }, { "epoch": 1.5065322295892347, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2256, "step": 4800 }, { "epoch": 1.506846090470399, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3446, "step": 4801 }, { "epoch": 1.5071599513515634, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.4563, "step": 4802 }, { "epoch": 1.5074738122327278, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.1671, "step": 4803 }, { "epoch": 1.5077876731138922, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.3043, "step": 4804 }, { "epoch": 1.5081015339950565, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.5666, "step": 4805 }, { "epoch": 1.5084153948762211, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.4813, "step": 4806 }, { "epoch": 1.5087292557573857, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.3034, "step": 4807 }, { "epoch": 1.50904311663855, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.4337, "step": 4808 }, { "epoch": 1.5093569775197144, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.5607, "step": 4809 }, { "epoch": 1.5096708384008788, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.5443, "step": 4810 }, { "epoch": 1.5099846992820432, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.595, "step": 4811 }, { "epoch": 1.5102985601632075, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.5278, "step": 4812 }, { "epoch": 1.510612421044372, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.5109, "step": 4813 }, { "epoch": 1.5109262819255365, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.5616, "step": 4814 }, { "epoch": 1.511240142806701, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.3786, "step": 4815 }, { "epoch": 1.5115540036878654, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.815, "step": 4816 }, { "epoch": 1.5118678645690298, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.4484, "step": 4817 }, { "epoch": 1.5121817254501941, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.7374, "step": 4818 }, { "epoch": 1.5124955863313585, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.615, "step": 4819 }, { "epoch": 1.512809447212523, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6979, "step": 4820 }, { "epoch": 1.5131233080936874, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 2.2839, "step": 4821 }, { "epoch": 1.513437168974852, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 2.0101, "step": 4822 }, { "epoch": 1.5137510298560164, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.6714, "step": 4823 }, { "epoch": 1.5140648907371808, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.9403, "step": 4824 }, { "epoch": 1.5143787516183451, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.8463, "step": 4825 }, { "epoch": 1.5146926124995095, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.6704, "step": 4826 }, { "epoch": 1.515006473380674, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7315, "step": 4827 }, { "epoch": 1.5153203342618384, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.6438, "step": 4828 }, { "epoch": 1.515634195143003, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.3559, "step": 4829 }, { "epoch": 1.5159480560241674, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.0338, "step": 4830 }, { "epoch": 1.5162619169053317, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.916, "step": 4831 }, { "epoch": 1.516575777786496, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.73, "step": 4832 }, { "epoch": 1.5168896386676605, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.8197, "step": 4833 }, { "epoch": 1.5172034995488248, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.7585, "step": 4834 }, { "epoch": 1.5175173604299894, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.8572, "step": 4835 }, { "epoch": 1.5178312213111538, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.4985, "step": 4836 }, { "epoch": 1.5181450821923184, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.5137, "step": 4837 }, { "epoch": 1.5184589430734827, "grad_norm": 0.1240234375, "learning_rate": 0.0002, "loss": 1.2961, "step": 4838 }, { "epoch": 1.518772803954647, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.4312, "step": 4839 }, { "epoch": 1.5190866648358115, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4462, "step": 4840 }, { "epoch": 1.5194005257169758, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.3613, "step": 4841 }, { "epoch": 1.5197143865981404, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3979, "step": 4842 }, { "epoch": 1.5200282474793048, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.4604, "step": 4843 }, { "epoch": 1.5203421083604693, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.3047, "step": 4844 }, { "epoch": 1.5206559692416337, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.2199, "step": 4845 }, { "epoch": 1.520969830122798, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.489, "step": 4846 }, { "epoch": 1.5212836910039624, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.501, "step": 4847 }, { "epoch": 1.5215975518851268, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4074, "step": 4848 }, { "epoch": 1.5219114127662914, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.4732, "step": 4849 }, { "epoch": 1.5222252736474557, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.2671, "step": 4850 }, { "epoch": 1.5225391345286203, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.2669, "step": 4851 }, { "epoch": 1.5228529954097847, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3934, "step": 4852 }, { "epoch": 1.523166856290949, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3603, "step": 4853 }, { "epoch": 1.5234807171721134, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3657, "step": 4854 }, { "epoch": 1.5237945780532778, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3856, "step": 4855 }, { "epoch": 1.5241084389344421, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3651, "step": 4856 }, { "epoch": 1.5244222998156067, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.4138, "step": 4857 }, { "epoch": 1.5247361606967713, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.5296, "step": 4858 }, { "epoch": 1.5250500215779357, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4891, "step": 4859 }, { "epoch": 1.5253638824591, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.4156, "step": 4860 }, { "epoch": 1.5256777433402644, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.4966, "step": 4861 }, { "epoch": 1.5259916042214288, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.6988, "step": 4862 }, { "epoch": 1.5259916042214288, "eval_loss": 1.7795552015304565, "eval_runtime": 122.9144, "eval_samples_per_second": 8.136, "eval_steps_per_second": 8.136, "step": 4862 }, { "epoch": 1.5259916042214288, "mmlu_eval_accuracy": 0.40338534054114106, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.2727272727272727, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.21875, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.5666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.25, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.29411764705882354, "mmlu_eval_accuracy_professional_medicine": 0.41935483870967744, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.148823298742614, "step": 4862 }, { "epoch": 1.5263054651025931, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4722, "step": 4863 }, { "epoch": 1.5266193259837577, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.3712, "step": 4864 }, { "epoch": 1.526933186864922, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.6423, "step": 4865 }, { "epoch": 1.5272470477460867, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.7239, "step": 4866 }, { "epoch": 1.527560908627251, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.5874, "step": 4867 }, { "epoch": 1.5278747695084154, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.0921, "step": 4868 }, { "epoch": 1.5281886303895797, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.8612, "step": 4869 }, { "epoch": 1.5285024912707441, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.812, "step": 4870 }, { "epoch": 1.5288163521519087, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 2.0088, "step": 4871 }, { "epoch": 1.529130213033073, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.0431, "step": 4872 }, { "epoch": 1.5294440739142376, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.3642, "step": 4873 }, { "epoch": 1.529757934795402, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.7782, "step": 4874 }, { "epoch": 1.5300717956765664, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.857, "step": 4875 }, { "epoch": 1.5303856565577307, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.7847, "step": 4876 }, { "epoch": 1.530699517438895, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.0318, "step": 4877 }, { "epoch": 1.5310133783200597, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.0986, "step": 4878 }, { "epoch": 1.531327239201224, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8357, "step": 4879 }, { "epoch": 1.5316411000823886, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.2329, "step": 4880 }, { "epoch": 1.531954960963553, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.9576, "step": 4881 }, { "epoch": 1.5322688218447174, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.7508, "step": 4882 }, { "epoch": 1.5325826827258817, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 2.1047, "step": 4883 }, { "epoch": 1.532896543607046, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.0002, "step": 4884 }, { "epoch": 1.5332104044882104, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.0798, "step": 4885 }, { "epoch": 1.533524265369375, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.3123, "step": 4886 }, { "epoch": 1.5338381262505396, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.4414, "step": 4887 }, { "epoch": 1.534151987131704, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.3801, "step": 4888 }, { "epoch": 1.5344658480128683, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3742, "step": 4889 }, { "epoch": 1.5347797088940327, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.2834, "step": 4890 }, { "epoch": 1.535093569775197, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.4661, "step": 4891 }, { "epoch": 1.5354074306563614, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.4743, "step": 4892 }, { "epoch": 1.535721291537526, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.3571, "step": 4893 }, { "epoch": 1.5360351524186904, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3627, "step": 4894 }, { "epoch": 1.536349013299855, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2071, "step": 4895 }, { "epoch": 1.5366628741810193, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.279, "step": 4896 }, { "epoch": 1.5369767350621837, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.3222, "step": 4897 }, { "epoch": 1.537290595943348, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.2858, "step": 4898 }, { "epoch": 1.5376044568245124, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.4648, "step": 4899 }, { "epoch": 1.537918317705677, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2091, "step": 4900 }, { "epoch": 1.5382321785868414, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.3099, "step": 4901 }, { "epoch": 1.538546039468006, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.2137, "step": 4902 }, { "epoch": 1.5388599003491703, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.294, "step": 4903 }, { "epoch": 1.5391737612303347, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.459, "step": 4904 }, { "epoch": 1.539487622111499, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3275, "step": 4905 }, { "epoch": 1.5398014829926634, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.2145, "step": 4906 }, { "epoch": 1.540115343873828, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.2768, "step": 4907 }, { "epoch": 1.5404292047549923, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3009, "step": 4908 }, { "epoch": 1.540743065636157, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.3927, "step": 4909 }, { "epoch": 1.5410569265173213, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.4026, "step": 4910 }, { "epoch": 1.5413707873984857, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.4493, "step": 4911 }, { "epoch": 1.54168464827965, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.5503, "step": 4912 }, { "epoch": 1.5419985091608144, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.3989, "step": 4913 }, { "epoch": 1.5423123700419787, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4274, "step": 4914 }, { "epoch": 1.5426262309231433, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.3415, "step": 4915 }, { "epoch": 1.5429400918043077, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.4149, "step": 4916 }, { "epoch": 1.5432539526854723, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.6919, "step": 4917 }, { "epoch": 1.5435678135666366, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.7856, "step": 4918 }, { "epoch": 1.543881674447801, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.6884, "step": 4919 }, { "epoch": 1.5441955353289654, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.724, "step": 4920 }, { "epoch": 1.5445093962101297, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.9534, "step": 4921 }, { "epoch": 1.5448232570912943, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.2285, "step": 4922 }, { "epoch": 1.5451371179724587, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.8947, "step": 4923 }, { "epoch": 1.5454509788536233, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.6949, "step": 4924 }, { "epoch": 1.5457648397347876, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.0834, "step": 4925 }, { "epoch": 1.546078700615952, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.296, "step": 4926 }, { "epoch": 1.5463925614971163, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.245, "step": 4927 }, { "epoch": 1.5467064223782807, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1643, "step": 4928 }, { "epoch": 1.5470202832594453, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.3892, "step": 4929 }, { "epoch": 1.5473341441406097, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.2535, "step": 4930 }, { "epoch": 1.5476480050217742, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.2135, "step": 4931 }, { "epoch": 1.5479618659029386, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.8875, "step": 4932 }, { "epoch": 1.548275726784103, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.284, "step": 4933 }, { "epoch": 1.5485895876652673, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.9798, "step": 4934 }, { "epoch": 1.5489034485464317, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.0749, "step": 4935 }, { "epoch": 1.549217309427596, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 2.5011, "step": 4936 }, { "epoch": 1.5495311703087606, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.4444, "step": 4937 }, { "epoch": 1.5498450311899252, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.3725, "step": 4938 }, { "epoch": 1.5501588920710896, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.136, "step": 4939 }, { "epoch": 1.550472752952254, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.4387, "step": 4940 }, { "epoch": 1.5507866138334183, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3535, "step": 4941 }, { "epoch": 1.5511004747145827, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.2157, "step": 4942 }, { "epoch": 1.551414335595747, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.4201, "step": 4943 }, { "epoch": 1.5517281964769116, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.4822, "step": 4944 }, { "epoch": 1.552042057358076, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.3504, "step": 4945 }, { "epoch": 1.5523559182392406, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.2893, "step": 4946 }, { "epoch": 1.552669779120405, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.2906, "step": 4947 }, { "epoch": 1.5529836400015693, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.312, "step": 4948 }, { "epoch": 1.5532975008827337, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3696, "step": 4949 }, { "epoch": 1.553611361763898, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.4176, "step": 4950 }, { "epoch": 1.5539252226450626, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.4041, "step": 4951 }, { "epoch": 1.554239083526227, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3403, "step": 4952 }, { "epoch": 1.5545529444073916, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.2961, "step": 4953 }, { "epoch": 1.554866805288556, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3889, "step": 4954 }, { "epoch": 1.5551806661697203, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.5129, "step": 4955 }, { "epoch": 1.5554945270508846, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.2571, "step": 4956 }, { "epoch": 1.555808387932049, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3474, "step": 4957 }, { "epoch": 1.5561222488132136, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.3583, "step": 4958 }, { "epoch": 1.556436109694378, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4329, "step": 4959 }, { "epoch": 1.5567499705755425, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.4391, "step": 4960 }, { "epoch": 1.557063831456707, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.4708, "step": 4961 }, { "epoch": 1.5573776923378713, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4421, "step": 4962 }, { "epoch": 1.5576915532190356, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4067, "step": 4963 }, { "epoch": 1.5580054141002, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.7318, "step": 4964 }, { "epoch": 1.5583192749813644, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.8052, "step": 4965 }, { "epoch": 1.558633135862529, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.6521, "step": 4966 }, { "epoch": 1.5589469967436933, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.8888, "step": 4967 }, { "epoch": 1.5592608576248579, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.9253, "step": 4968 }, { "epoch": 1.5595747185060223, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.6776, "step": 4969 }, { "epoch": 1.5598885793871866, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.9312, "step": 4970 }, { "epoch": 1.560202440268351, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.9008, "step": 4971 }, { "epoch": 1.5605163011495153, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.8292, "step": 4972 }, { "epoch": 1.56083016203068, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.8828, "step": 4973 }, { "epoch": 1.5611440229118443, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 2.0352, "step": 4974 }, { "epoch": 1.5614578837930089, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.9757, "step": 4975 }, { "epoch": 1.5617717446741732, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.2297, "step": 4976 }, { "epoch": 1.5620856055553376, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.9935, "step": 4977 }, { "epoch": 1.562399466436502, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.9956, "step": 4978 }, { "epoch": 1.5627133273176663, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.1654, "step": 4979 }, { "epoch": 1.563027188198831, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.2317, "step": 4980 }, { "epoch": 1.5633410490799953, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.6525, "step": 4981 }, { "epoch": 1.5636549099611599, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9219, "step": 4982 }, { "epoch": 1.5639687708423242, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.8591, "step": 4983 }, { "epoch": 1.5642826317234886, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.1601, "step": 4984 }, { "epoch": 1.564596492604653, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.0061, "step": 4985 }, { "epoch": 1.5649103534858173, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.063, "step": 4986 }, { "epoch": 1.5652242143669817, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.7017, "step": 4987 }, { "epoch": 1.5655380752481463, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.3287, "step": 4988 }, { "epoch": 1.5658519361293108, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.2995, "step": 4989 }, { "epoch": 1.5661657970104752, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.3732, "step": 4990 }, { "epoch": 1.5664796578916396, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.3189, "step": 4991 }, { "epoch": 1.566793518772804, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.4269, "step": 4992 }, { "epoch": 1.5671073796539683, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3134, "step": 4993 }, { "epoch": 1.5674212405351327, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.3434, "step": 4994 }, { "epoch": 1.5677351014162972, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3721, "step": 4995 }, { "epoch": 1.5680489622974616, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3956, "step": 4996 }, { "epoch": 1.5683628231786262, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.4549, "step": 4997 }, { "epoch": 1.5686766840597905, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.2518, "step": 4998 }, { "epoch": 1.568990544940955, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3455, "step": 4999 }, { "epoch": 1.5693044058221193, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3074, "step": 5000 }, { "epoch": 1.5696182667032836, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.2701, "step": 5001 }, { "epoch": 1.5699321275844482, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.242, "step": 5002 }, { "epoch": 1.5702459884656126, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.2864, "step": 5003 }, { "epoch": 1.5705598493467772, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3041, "step": 5004 }, { "epoch": 1.5708737102279415, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3437, "step": 5005 }, { "epoch": 1.571187571109106, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3458, "step": 5006 }, { "epoch": 1.5715014319902703, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4165, "step": 5007 }, { "epoch": 1.5718152928714346, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3758, "step": 5008 }, { "epoch": 1.5721291537525992, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.5677, "step": 5009 }, { "epoch": 1.5724430146337636, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.3738, "step": 5010 }, { "epoch": 1.5727568755149282, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1974, "step": 5011 }, { "epoch": 1.5730707363960925, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.5345, "step": 5012 }, { "epoch": 1.5733845972772569, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.4702, "step": 5013 }, { "epoch": 1.5736984581584212, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.5037, "step": 5014 }, { "epoch": 1.5740123190395856, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.3174, "step": 5015 }, { "epoch": 1.57432617992075, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.7725, "step": 5016 }, { "epoch": 1.5746400408019146, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.7217, "step": 5017 }, { "epoch": 1.5749539016830791, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.1253, "step": 5018 }, { "epoch": 1.5752677625642435, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.7848, "step": 5019 }, { "epoch": 1.5755816234454079, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.8598, "step": 5020 }, { "epoch": 1.5758954843265722, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.9197, "step": 5021 }, { "epoch": 1.5762093452077366, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.8669, "step": 5022 }, { "epoch": 1.576523206088901, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.5558, "step": 5023 }, { "epoch": 1.5768370669700655, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 2.0621, "step": 5024 }, { "epoch": 1.57715092785123, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.0386, "step": 5025 }, { "epoch": 1.5774647887323945, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.8108, "step": 5026 }, { "epoch": 1.5777786496135588, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.9032, "step": 5027 }, { "epoch": 1.5780925104947232, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.4514, "step": 5028 }, { "epoch": 1.5784063713758876, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.7561, "step": 5029 }, { "epoch": 1.578720232257052, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.263, "step": 5030 }, { "epoch": 1.5790340931382165, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.9355, "step": 5031 }, { "epoch": 1.5793479540193809, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.7209, "step": 5032 }, { "epoch": 1.5796618149005455, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.574, "step": 5033 }, { "epoch": 1.5799756757817098, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.972, "step": 5034 }, { "epoch": 1.5802895366628742, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.0734, "step": 5035 }, { "epoch": 1.5806033975440386, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 2.6693, "step": 5036 }, { "epoch": 1.580917258425203, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3638, "step": 5037 }, { "epoch": 1.5812311193063675, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2417, "step": 5038 }, { "epoch": 1.5815449801875319, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.2349, "step": 5039 }, { "epoch": 1.5818588410686965, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3286, "step": 5040 }, { "epoch": 1.5821727019498608, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.3434, "step": 5041 }, { "epoch": 1.5824865628310252, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4358, "step": 5042 }, { "epoch": 1.5828004237121895, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.3198, "step": 5043 }, { "epoch": 1.583114284593354, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3207, "step": 5044 }, { "epoch": 1.5834281454745183, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.3239, "step": 5045 }, { "epoch": 1.5837420063556829, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5401, "step": 5046 }, { "epoch": 1.5840558672368472, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4877, "step": 5047 }, { "epoch": 1.5843697281180118, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.289, "step": 5048 }, { "epoch": 1.5846835889991762, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3372, "step": 5049 }, { "epoch": 1.5846835889991762, "eval_loss": 1.7925392389297485, "eval_runtime": 123.6463, "eval_samples_per_second": 8.088, "eval_steps_per_second": 8.088, "step": 5049 }, { "epoch": 1.5846835889991762, "mmlu_eval_accuracy": 0.4240702509811583, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.5625, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.36585365853658536, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.21875, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.5666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5697674418604651, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.28823529411764703, "mmlu_eval_accuracy_professional_medicine": 0.4838709677419355, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.5, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.1857343875626034, "step": 5049 }, { "epoch": 1.5849974498803405, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.257, "step": 5050 }, { "epoch": 1.5853113107615049, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3983, "step": 5051 }, { "epoch": 1.5856251716426693, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3132, "step": 5052 }, { "epoch": 1.5859390325238338, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.2861, "step": 5053 }, { "epoch": 1.5862528934049982, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.4179, "step": 5054 }, { "epoch": 1.5865667542861628, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3597, "step": 5055 }, { "epoch": 1.5868806151673271, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.2883, "step": 5056 }, { "epoch": 1.5871944760484915, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.4195, "step": 5057 }, { "epoch": 1.5875083369296559, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.4079, "step": 5058 }, { "epoch": 1.5878221978108202, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.633, "step": 5059 }, { "epoch": 1.5881360586919848, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2581, "step": 5060 }, { "epoch": 1.5884499195731492, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.4433, "step": 5061 }, { "epoch": 1.5887637804543138, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.3953, "step": 5062 }, { "epoch": 1.5890776413354781, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.3274, "step": 5063 }, { "epoch": 1.5893915022166425, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.3063, "step": 5064 }, { "epoch": 1.5897053630978069, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6481, "step": 5065 }, { "epoch": 1.5900192239789712, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.7481, "step": 5066 }, { "epoch": 1.5903330848601356, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.5678, "step": 5067 }, { "epoch": 1.5906469457413002, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8494, "step": 5068 }, { "epoch": 1.5909608066224648, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.6816, "step": 5069 }, { "epoch": 1.5912746675036291, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.3967, "step": 5070 }, { "epoch": 1.5915885283847935, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.588, "step": 5071 }, { "epoch": 1.5919023892659578, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.4999, "step": 5072 }, { "epoch": 1.5922162501471222, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.7389, "step": 5073 }, { "epoch": 1.5925301110282866, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.0021, "step": 5074 }, { "epoch": 1.5928439719094512, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.7741, "step": 5075 }, { "epoch": 1.5931578327906155, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.0061, "step": 5076 }, { "epoch": 1.59347169367178, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.3159, "step": 5077 }, { "epoch": 1.5937855545529445, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.0496, "step": 5078 }, { "epoch": 1.5940994154341088, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.0805, "step": 5079 }, { "epoch": 1.5944132763152732, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.3991, "step": 5080 }, { "epoch": 1.5947271371964375, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6801, "step": 5081 }, { "epoch": 1.5950409980776021, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 2.0074, "step": 5082 }, { "epoch": 1.5953548589587665, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.8482, "step": 5083 }, { "epoch": 1.595668719839931, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.6638, "step": 5084 }, { "epoch": 1.5959825807210954, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.5203, "step": 5085 }, { "epoch": 1.5962964416022598, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 2.508, "step": 5086 }, { "epoch": 1.5966103024834242, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4785, "step": 5087 }, { "epoch": 1.5969241633645885, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.2313, "step": 5088 }, { "epoch": 1.5972380242457531, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3661, "step": 5089 }, { "epoch": 1.5975518851269175, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.4161, "step": 5090 }, { "epoch": 1.597865746008082, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3296, "step": 5091 }, { "epoch": 1.5981796068892464, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.2616, "step": 5092 }, { "epoch": 1.5984934677704108, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3501, "step": 5093 }, { "epoch": 1.5988073286515752, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.3956, "step": 5094 }, { "epoch": 1.5991211895327395, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.366, "step": 5095 }, { "epoch": 1.5994350504139039, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.4729, "step": 5096 }, { "epoch": 1.5997489112950685, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.3622, "step": 5097 }, { "epoch": 1.600062772176233, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.2904, "step": 5098 }, { "epoch": 1.6003766330573974, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.337, "step": 5099 }, { "epoch": 1.6006904939385618, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3583, "step": 5100 }, { "epoch": 1.6010043548197261, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.3341, "step": 5101 }, { "epoch": 1.6013182157008905, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.3488, "step": 5102 }, { "epoch": 1.6016320765820549, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3594, "step": 5103 }, { "epoch": 1.6019459374632194, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.2418, "step": 5104 }, { "epoch": 1.6022597983443838, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.423, "step": 5105 }, { "epoch": 1.6025736592255484, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.5248, "step": 5106 }, { "epoch": 1.6028875201067128, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3286, "step": 5107 }, { "epoch": 1.6032013809878771, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.406, "step": 5108 }, { "epoch": 1.6035152418690415, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.239, "step": 5109 }, { "epoch": 1.6038291027502058, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.4122, "step": 5110 }, { "epoch": 1.6041429636313704, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.6443, "step": 5111 }, { "epoch": 1.6044568245125348, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.4452, "step": 5112 }, { "epoch": 1.6047706853936994, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2717, "step": 5113 }, { "epoch": 1.6050845462748637, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.4244, "step": 5114 }, { "epoch": 1.605398407156028, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.4921, "step": 5115 }, { "epoch": 1.6057122680371925, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.7972, "step": 5116 }, { "epoch": 1.6060261289183568, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.6309, "step": 5117 }, { "epoch": 1.6063399897995214, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.6465, "step": 5118 }, { "epoch": 1.6066538506806858, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.7099, "step": 5119 }, { "epoch": 1.6069677115618504, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.0473, "step": 5120 }, { "epoch": 1.6072815724430147, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.8316, "step": 5121 }, { "epoch": 1.607595433324179, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.9689, "step": 5122 }, { "epoch": 1.6079092942053435, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8618, "step": 5123 }, { "epoch": 1.6082231550865078, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.8294, "step": 5124 }, { "epoch": 1.6085370159676722, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 2.2998, "step": 5125 }, { "epoch": 1.6088508768488368, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.9971, "step": 5126 }, { "epoch": 1.6091647377300011, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.8556, "step": 5127 }, { "epoch": 1.6094785986111657, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.8845, "step": 5128 }, { "epoch": 1.60979245949233, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.8863, "step": 5129 }, { "epoch": 1.6101063203734944, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.8496, "step": 5130 }, { "epoch": 1.6104201812546588, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.2089, "step": 5131 }, { "epoch": 1.6107340421358232, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.9057, "step": 5132 }, { "epoch": 1.6110479030169877, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.6346, "step": 5133 }, { "epoch": 1.611361763898152, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.6142, "step": 5134 }, { "epoch": 1.6116756247793167, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 2.1235, "step": 5135 }, { "epoch": 1.611989485660481, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 2.7421, "step": 5136 }, { "epoch": 1.6123033465416454, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.3866, "step": 5137 }, { "epoch": 1.6126172074228098, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.1569, "step": 5138 }, { "epoch": 1.6129310683039741, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.3513, "step": 5139 }, { "epoch": 1.6132449291851387, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.4872, "step": 5140 }, { "epoch": 1.613558790066303, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.4158, "step": 5141 }, { "epoch": 1.6138726509474677, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.2856, "step": 5142 }, { "epoch": 1.614186511828632, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.4552, "step": 5143 }, { "epoch": 1.6145003727097964, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3369, "step": 5144 }, { "epoch": 1.6148142335909608, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4883, "step": 5145 }, { "epoch": 1.6151280944721251, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.3296, "step": 5146 }, { "epoch": 1.6154419553532895, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.231, "step": 5147 }, { "epoch": 1.615755816234454, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3855, "step": 5148 }, { "epoch": 1.6160696771156187, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.3475, "step": 5149 }, { "epoch": 1.616383537996783, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.1748, "step": 5150 }, { "epoch": 1.6166973988779474, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4136, "step": 5151 }, { "epoch": 1.6170112597591118, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.227, "step": 5152 }, { "epoch": 1.6173251206402761, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2219, "step": 5153 }, { "epoch": 1.6176389815214405, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.3647, "step": 5154 }, { "epoch": 1.617952842402605, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3262, "step": 5155 }, { "epoch": 1.6182667032837694, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.39, "step": 5156 }, { "epoch": 1.618580564164934, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.299, "step": 5157 }, { "epoch": 1.6188944250460984, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.294, "step": 5158 }, { "epoch": 1.6192082859272627, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3709, "step": 5159 }, { "epoch": 1.619522146808427, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.5415, "step": 5160 }, { "epoch": 1.6198360076895915, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.688, "step": 5161 }, { "epoch": 1.620149868570756, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6558, "step": 5162 }, { "epoch": 1.6204637294519204, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.7029, "step": 5163 }, { "epoch": 1.620777590333085, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.5837, "step": 5164 }, { "epoch": 1.6210914512142494, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7758, "step": 5165 }, { "epoch": 1.6214053120954137, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.6964, "step": 5166 }, { "epoch": 1.621719172976578, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.4762, "step": 5167 }, { "epoch": 1.6220330338577424, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.5627, "step": 5168 }, { "epoch": 1.622346894738907, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.9072, "step": 5169 }, { "epoch": 1.6226607556200714, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.5664, "step": 5170 }, { "epoch": 1.622974616501236, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5761, "step": 5171 }, { "epoch": 1.6232884773824003, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.9573, "step": 5172 }, { "epoch": 1.6236023382635647, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.001, "step": 5173 }, { "epoch": 1.623916199144729, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.3235, "step": 5174 }, { "epoch": 1.6242300600258934, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.2949, "step": 5175 }, { "epoch": 1.6245439209070578, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.0129, "step": 5176 }, { "epoch": 1.6248577817882224, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.0571, "step": 5177 }, { "epoch": 1.6251716426693867, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.3679, "step": 5178 }, { "epoch": 1.6254855035505513, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.7976, "step": 5179 }, { "epoch": 1.6257993644317157, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.1299, "step": 5180 }, { "epoch": 1.62611322531288, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.7216, "step": 5181 }, { "epoch": 1.6264270861940444, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.518, "step": 5182 }, { "epoch": 1.6267409470752088, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.1603, "step": 5183 }, { "epoch": 1.6270548079563734, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.9139, "step": 5184 }, { "epoch": 1.6273686688375377, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.3273, "step": 5185 }, { "epoch": 1.6276825297187023, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.4695, "step": 5186 }, { "epoch": 1.6279963905998667, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.4228, "step": 5187 }, { "epoch": 1.628310251481031, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3978, "step": 5188 }, { "epoch": 1.6286241123621954, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3138, "step": 5189 }, { "epoch": 1.6289379732433598, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.39, "step": 5190 }, { "epoch": 1.6292518341245243, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.4673, "step": 5191 }, { "epoch": 1.6295656950056887, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.3023, "step": 5192 }, { "epoch": 1.6298795558868533, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.2163, "step": 5193 }, { "epoch": 1.6301934167680177, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.27, "step": 5194 }, { "epoch": 1.630507277649182, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.4509, "step": 5195 }, { "epoch": 1.6308211385303464, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.344, "step": 5196 }, { "epoch": 1.6311349994115107, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.4739, "step": 5197 }, { "epoch": 1.631448860292675, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.329, "step": 5198 }, { "epoch": 1.6317627211738397, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.3776, "step": 5199 }, { "epoch": 1.6320765820550043, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3536, "step": 5200 }, { "epoch": 1.6323904429361686, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.2967, "step": 5201 }, { "epoch": 1.632704303817333, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.2148, "step": 5202 }, { "epoch": 1.6330181646984974, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.1922, "step": 5203 }, { "epoch": 1.6333320255796617, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4999, "step": 5204 }, { "epoch": 1.633645886460826, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2931, "step": 5205 }, { "epoch": 1.6339597473419907, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.418, "step": 5206 }, { "epoch": 1.634273608223155, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.4777, "step": 5207 }, { "epoch": 1.6345874691043196, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.3601, "step": 5208 }, { "epoch": 1.634901329985484, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.3026, "step": 5209 }, { "epoch": 1.6352151908666483, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4258, "step": 5210 }, { "epoch": 1.6355290517478127, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3424, "step": 5211 }, { "epoch": 1.635842912628977, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.6434, "step": 5212 }, { "epoch": 1.6361567735101417, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4022, "step": 5213 }, { "epoch": 1.636470634391306, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.5548, "step": 5214 }, { "epoch": 1.6367844952724706, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.4238, "step": 5215 }, { "epoch": 1.637098356153635, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.6629, "step": 5216 }, { "epoch": 1.6374122170347993, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.7608, "step": 5217 }, { "epoch": 1.6377260779159637, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.7776, "step": 5218 }, { "epoch": 1.638039938797128, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.5472, "step": 5219 }, { "epoch": 1.6383537996782926, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.7492, "step": 5220 }, { "epoch": 1.638667660559457, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.0288, "step": 5221 }, { "epoch": 1.6389815214406216, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.2096, "step": 5222 }, { "epoch": 1.639295382321786, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.0986, "step": 5223 }, { "epoch": 1.6396092432029503, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.6296, "step": 5224 }, { "epoch": 1.6399231040841147, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.9282, "step": 5225 }, { "epoch": 1.640236964965279, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9752, "step": 5226 }, { "epoch": 1.6405508258464434, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.299, "step": 5227 }, { "epoch": 1.640864686727608, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.8258, "step": 5228 }, { "epoch": 1.6411785476087726, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.6818, "step": 5229 }, { "epoch": 1.641492408489937, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.6708, "step": 5230 }, { "epoch": 1.6418062693711013, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.13, "step": 5231 }, { "epoch": 1.6421201302522657, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.9442, "step": 5232 }, { "epoch": 1.64243399113343, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.9665, "step": 5233 }, { "epoch": 1.6427478520145944, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.8521, "step": 5234 }, { "epoch": 1.643061712895759, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.8763, "step": 5235 }, { "epoch": 1.6433755737769233, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.6208, "step": 5236 }, { "epoch": 1.6433755737769233, "eval_loss": 1.7919257879257202, "eval_runtime": 146.8008, "eval_samples_per_second": 6.812, "eval_steps_per_second": 6.812, "step": 5236 }, { "epoch": 1.6433755737769233, "mmlu_eval_accuracy": 0.4046188198456904, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.5833333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5581395348837209, "mmlu_eval_accuracy_moral_disputes": 0.5, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.5151515151515151, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.0532882166679665, "step": 5236 }, { "epoch": 1.643689434658088, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5944, "step": 5237 }, { "epoch": 1.6440032955392523, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.3256, "step": 5238 }, { "epoch": 1.6443171564204166, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.4149, "step": 5239 }, { "epoch": 1.644631017301581, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.32, "step": 5240 }, { "epoch": 1.6449448781827454, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.472, "step": 5241 }, { "epoch": 1.64525873906391, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.4399, "step": 5242 }, { "epoch": 1.6455725999450743, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.4195, "step": 5243 }, { "epoch": 1.645886460826239, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3585, "step": 5244 }, { "epoch": 1.6462003217074033, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.5199, "step": 5245 }, { "epoch": 1.6465141825885676, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.4195, "step": 5246 }, { "epoch": 1.646828043469732, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3701, "step": 5247 }, { "epoch": 1.6471419043508964, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3268, "step": 5248 }, { "epoch": 1.647455765232061, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.2806, "step": 5249 }, { "epoch": 1.6477696261132253, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.4121, "step": 5250 }, { "epoch": 1.64808348699439, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.2581, "step": 5251 }, { "epoch": 1.6483973478755543, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.4176, "step": 5252 }, { "epoch": 1.6487112087567186, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.6702, "step": 5253 }, { "epoch": 1.649025069637883, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.374, "step": 5254 }, { "epoch": 1.6493389305190473, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.1493, "step": 5255 }, { "epoch": 1.6496527914002117, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.4382, "step": 5256 }, { "epoch": 1.6499666522813763, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.4023, "step": 5257 }, { "epoch": 1.6502805131625407, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.5046, "step": 5258 }, { "epoch": 1.6505943740437052, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4378, "step": 5259 }, { "epoch": 1.6509082349248696, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.6153, "step": 5260 }, { "epoch": 1.651222095806034, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4205, "step": 5261 }, { "epoch": 1.6515359566871983, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.6033, "step": 5262 }, { "epoch": 1.6518498175683627, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.8266, "step": 5263 }, { "epoch": 1.6521636784495273, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.611, "step": 5264 }, { "epoch": 1.6524775393306916, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.5343, "step": 5265 }, { "epoch": 1.6527914002118562, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.091, "step": 5266 }, { "epoch": 1.6531052610930206, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8864, "step": 5267 }, { "epoch": 1.653419121974185, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.8913, "step": 5268 }, { "epoch": 1.6537329828553493, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.6002, "step": 5269 }, { "epoch": 1.6540468437365137, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.7484, "step": 5270 }, { "epoch": 1.6543607046176783, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 2.0815, "step": 5271 }, { "epoch": 1.6546745654988426, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.0333, "step": 5272 }, { "epoch": 1.6549884263800072, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.9842, "step": 5273 }, { "epoch": 1.6553022872611716, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 2.1703, "step": 5274 }, { "epoch": 1.655616148142336, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 2.2295, "step": 5275 }, { "epoch": 1.6559300090235003, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.9882, "step": 5276 }, { "epoch": 1.6562438699046647, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.1352, "step": 5277 }, { "epoch": 1.656557730785829, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.1654, "step": 5278 }, { "epoch": 1.6568715916669936, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.0885, "step": 5279 }, { "epoch": 1.6571854525481582, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.9674, "step": 5280 }, { "epoch": 1.6574993134293226, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 2.0082, "step": 5281 }, { "epoch": 1.657813174310487, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.0707, "step": 5282 }, { "epoch": 1.6581270351916513, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.8177, "step": 5283 }, { "epoch": 1.6584408960728156, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.8653, "step": 5284 }, { "epoch": 1.65875475695398, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.8444, "step": 5285 }, { "epoch": 1.6590686178351446, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.4233, "step": 5286 }, { "epoch": 1.659382478716309, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4098, "step": 5287 }, { "epoch": 1.6596963395974735, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.4043, "step": 5288 }, { "epoch": 1.660010200478638, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.3667, "step": 5289 }, { "epoch": 1.6603240613598023, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.4895, "step": 5290 }, { "epoch": 1.6606379222409666, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.3081, "step": 5291 }, { "epoch": 1.660951783122131, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.408, "step": 5292 }, { "epoch": 1.6612656440032956, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4971, "step": 5293 }, { "epoch": 1.66157950488446, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.4166, "step": 5294 }, { "epoch": 1.6618933657656245, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.5222, "step": 5295 }, { "epoch": 1.6622072266467889, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3026, "step": 5296 }, { "epoch": 1.6625210875279532, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3182, "step": 5297 }, { "epoch": 1.6628349484091176, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.4146, "step": 5298 }, { "epoch": 1.663148809290282, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.3603, "step": 5299 }, { "epoch": 1.6634626701714466, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.4746, "step": 5300 }, { "epoch": 1.663776531052611, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.2972, "step": 5301 }, { "epoch": 1.6640903919337755, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.2751, "step": 5302 }, { "epoch": 1.6644042528149399, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.345, "step": 5303 }, { "epoch": 1.6647181136961042, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.3429, "step": 5304 }, { "epoch": 1.6650319745772686, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.188, "step": 5305 }, { "epoch": 1.665345835458433, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2375, "step": 5306 }, { "epoch": 1.6656596963395973, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4213, "step": 5307 }, { "epoch": 1.665973557220762, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.1308, "step": 5308 }, { "epoch": 1.6662874181019265, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3391, "step": 5309 }, { "epoch": 1.6666012789830909, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.5578, "step": 5310 }, { "epoch": 1.6669151398642552, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.7623, "step": 5311 }, { "epoch": 1.6672290007454196, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.6784, "step": 5312 }, { "epoch": 1.667542861626584, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.4349, "step": 5313 }, { "epoch": 1.6678567225077483, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.6305, "step": 5314 }, { "epoch": 1.6681705833889129, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.8189, "step": 5315 }, { "epoch": 1.6684844442700772, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 2.0669, "step": 5316 }, { "epoch": 1.6687983051512418, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.725, "step": 5317 }, { "epoch": 1.6691121660324062, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.9341, "step": 5318 }, { "epoch": 1.6694260269135706, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7933, "step": 5319 }, { "epoch": 1.669739887794735, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.5582, "step": 5320 }, { "epoch": 1.6700537486758993, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.1427, "step": 5321 }, { "epoch": 1.6703676095570639, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.9629, "step": 5322 }, { "epoch": 1.6706814704382282, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 2.0382, "step": 5323 }, { "epoch": 1.6709953313193928, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 2.1086, "step": 5324 }, { "epoch": 1.6713091922005572, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.6112, "step": 5325 }, { "epoch": 1.6716230530817215, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.9886, "step": 5326 }, { "epoch": 1.671936913962886, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.3365, "step": 5327 }, { "epoch": 1.6722507748440503, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1497, "step": 5328 }, { "epoch": 1.6725646357252146, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.1876, "step": 5329 }, { "epoch": 1.6728784966063792, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 1.9152, "step": 5330 }, { "epoch": 1.6731923574875438, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9573, "step": 5331 }, { "epoch": 1.6735062183687082, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.8511, "step": 5332 }, { "epoch": 1.6738200792498725, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.4428, "step": 5333 }, { "epoch": 1.674133940131037, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.5779, "step": 5334 }, { "epoch": 1.6744478010122013, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 2.4024, "step": 5335 }, { "epoch": 1.6747616618933656, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.1432, "step": 5336 }, { "epoch": 1.6750755227745302, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.4491, "step": 5337 }, { "epoch": 1.6753893836556946, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.4891, "step": 5338 }, { "epoch": 1.6757032445368591, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.4065, "step": 5339 }, { "epoch": 1.6760171054180235, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.3976, "step": 5340 }, { "epoch": 1.6763309662991879, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.2961, "step": 5341 }, { "epoch": 1.6766448271803522, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4172, "step": 5342 }, { "epoch": 1.6769586880615166, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.343, "step": 5343 }, { "epoch": 1.6772725489426812, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3408, "step": 5344 }, { "epoch": 1.6775864098238455, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.4977, "step": 5345 }, { "epoch": 1.6779002707050101, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.4584, "step": 5346 }, { "epoch": 1.6782141315861745, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4354, "step": 5347 }, { "epoch": 1.6785279924673389, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.3733, "step": 5348 }, { "epoch": 1.6788418533485032, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.2364, "step": 5349 }, { "epoch": 1.6791557142296676, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.2784, "step": 5350 }, { "epoch": 1.6794695751108322, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.4247, "step": 5351 }, { "epoch": 1.6797834359919965, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2141, "step": 5352 }, { "epoch": 1.6800972968731611, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.3006, "step": 5353 }, { "epoch": 1.6804111577543255, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3487, "step": 5354 }, { "epoch": 1.6807250186354898, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.2892, "step": 5355 }, { "epoch": 1.6810388795166542, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.3255, "step": 5356 }, { "epoch": 1.6813527403978186, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6207, "step": 5357 }, { "epoch": 1.681666601278983, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.6852, "step": 5358 }, { "epoch": 1.6819804621601475, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3062, "step": 5359 }, { "epoch": 1.682294323041312, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.4352, "step": 5360 }, { "epoch": 1.6826081839224765, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.4089, "step": 5361 }, { "epoch": 1.6829220448036408, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.4615, "step": 5362 }, { "epoch": 1.6832359056848052, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.7598, "step": 5363 }, { "epoch": 1.6835497665659696, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.9255, "step": 5364 }, { "epoch": 1.683863627447134, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.4791, "step": 5365 }, { "epoch": 1.6841774883282985, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.8529, "step": 5366 }, { "epoch": 1.6844913492094629, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.6055, "step": 5367 }, { "epoch": 1.6848052100906274, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.7601, "step": 5368 }, { "epoch": 1.6851190709717918, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 2.0651, "step": 5369 }, { "epoch": 1.6854329318529562, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.8065, "step": 5370 }, { "epoch": 1.6857467927341205, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.9603, "step": 5371 }, { "epoch": 1.686060653615285, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.8112, "step": 5372 }, { "epoch": 1.6863745144964495, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.5999, "step": 5373 }, { "epoch": 1.6866883753776138, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1754, "step": 5374 }, { "epoch": 1.6870022362587784, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7879, "step": 5375 }, { "epoch": 1.6873160971399428, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.9381, "step": 5376 }, { "epoch": 1.6876299580211072, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.6776, "step": 5377 }, { "epoch": 1.6879438189022715, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.0856, "step": 5378 }, { "epoch": 1.6882576797834359, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.8692, "step": 5379 }, { "epoch": 1.6885715406646005, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 2.1128, "step": 5380 }, { "epoch": 1.6888854015457648, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.2145, "step": 5381 }, { "epoch": 1.6891992624269294, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.8243, "step": 5382 }, { "epoch": 1.6895131233080938, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.8438, "step": 5383 }, { "epoch": 1.6898269841892581, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.1272, "step": 5384 }, { "epoch": 1.6901408450704225, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.3897, "step": 5385 }, { "epoch": 1.6904547059515869, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 2.5437, "step": 5386 }, { "epoch": 1.6907685668327512, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.4554, "step": 5387 }, { "epoch": 1.6910824277139158, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 1.4041, "step": 5388 }, { "epoch": 1.6913962885950802, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3059, "step": 5389 }, { "epoch": 1.6917101494762448, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.4077, "step": 5390 }, { "epoch": 1.6920240103574091, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4185, "step": 5391 }, { "epoch": 1.6923378712385735, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.4299, "step": 5392 }, { "epoch": 1.6926517321197379, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.3579, "step": 5393 }, { "epoch": 1.6929655930009022, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4735, "step": 5394 }, { "epoch": 1.6932794538820668, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.3569, "step": 5395 }, { "epoch": 1.6935933147632312, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.5498, "step": 5396 }, { "epoch": 1.6939071756443957, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.429, "step": 5397 }, { "epoch": 1.69422103652556, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.4454, "step": 5398 }, { "epoch": 1.6945348974067245, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.2722, "step": 5399 }, { "epoch": 1.6948487582878888, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.2964, "step": 5400 }, { "epoch": 1.6951626191690532, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.3567, "step": 5401 }, { "epoch": 1.6954764800502178, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3113, "step": 5402 }, { "epoch": 1.6957903409313821, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.2178, "step": 5403 }, { "epoch": 1.6961042018125467, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.377, "step": 5404 }, { "epoch": 1.696418062693711, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.3427, "step": 5405 }, { "epoch": 1.6967319235748755, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.4818, "step": 5406 }, { "epoch": 1.6970457844560398, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.3405, "step": 5407 }, { "epoch": 1.6973596453372042, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.3294, "step": 5408 }, { "epoch": 1.6976735062183685, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.4936, "step": 5409 }, { "epoch": 1.6979873670995331, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.282, "step": 5410 }, { "epoch": 1.6983012279806977, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.6205, "step": 5411 }, { "epoch": 1.698615088861862, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.457, "step": 5412 }, { "epoch": 1.6989289497430264, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4012, "step": 5413 }, { "epoch": 1.6992428106241908, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.4042, "step": 5414 }, { "epoch": 1.6995566715053552, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.5957, "step": 5415 }, { "epoch": 1.6998705323865195, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.6471, "step": 5416 }, { "epoch": 1.7001843932676841, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.9019, "step": 5417 }, { "epoch": 1.7004982541488485, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.5745, "step": 5418 }, { "epoch": 1.700812115030013, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.845, "step": 5419 }, { "epoch": 1.7011259759111774, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 2.0496, "step": 5420 }, { "epoch": 1.7014398367923418, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.7222, "step": 5421 }, { "epoch": 1.7017536976735061, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.8778, "step": 5422 }, { "epoch": 1.7020675585546705, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.7047, "step": 5423 }, { "epoch": 1.7020675585546705, "eval_loss": 1.7857495546340942, "eval_runtime": 147.0108, "eval_samples_per_second": 6.802, "eval_steps_per_second": 6.802, "step": 5423 }, { "epoch": 1.7020675585546705, "mmlu_eval_accuracy": 0.4185827138010515, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.6190476190476191, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.25, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.0690680153030419, "step": 5423 }, { "epoch": 1.702381419435835, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.8613, "step": 5424 }, { "epoch": 1.7026952803169995, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.518, "step": 5425 }, { "epoch": 1.703009141198164, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6108, "step": 5426 }, { "epoch": 1.7033230020793284, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.6537, "step": 5427 }, { "epoch": 1.7036368629604928, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.6812, "step": 5428 }, { "epoch": 1.7039507238416571, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.6862, "step": 5429 }, { "epoch": 1.7042645847228215, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.8144, "step": 5430 }, { "epoch": 1.704578445603986, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.3003, "step": 5431 }, { "epoch": 1.7048923064851504, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.868, "step": 5432 }, { "epoch": 1.705206167366315, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 2.5762, "step": 5433 }, { "epoch": 1.7055200282474794, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 2.1479, "step": 5434 }, { "epoch": 1.7058338891286438, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.8356, "step": 5435 }, { "epoch": 1.7061477500098081, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 2.7924, "step": 5436 }, { "epoch": 1.7064616108909725, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.6025, "step": 5437 }, { "epoch": 1.7067754717721368, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.3645, "step": 5438 }, { "epoch": 1.7070893326533014, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.4154, "step": 5439 }, { "epoch": 1.707403193534466, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.3476, "step": 5440 }, { "epoch": 1.7077170544156304, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.5175, "step": 5441 }, { "epoch": 1.7080309152967947, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4335, "step": 5442 }, { "epoch": 1.708344776177959, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.4175, "step": 5443 }, { "epoch": 1.7086586370591235, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4136, "step": 5444 }, { "epoch": 1.7089724979402878, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.4126, "step": 5445 }, { "epoch": 1.7092863588214524, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.3386, "step": 5446 }, { "epoch": 1.7096002197026168, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.2632, "step": 5447 }, { "epoch": 1.7099140805837814, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.5089, "step": 5448 }, { "epoch": 1.7102279414649457, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.294, "step": 5449 }, { "epoch": 1.71054180234611, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.2516, "step": 5450 }, { "epoch": 1.7108556632272744, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.2478, "step": 5451 }, { "epoch": 1.7111695241084388, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.2558, "step": 5452 }, { "epoch": 1.7114833849896034, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.4596, "step": 5453 }, { "epoch": 1.7117972458707678, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.5655, "step": 5454 }, { "epoch": 1.7121111067519323, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.5339, "step": 5455 }, { "epoch": 1.7124249676330967, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.2464, "step": 5456 }, { "epoch": 1.712738828514261, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.5458, "step": 5457 }, { "epoch": 1.7130526893954254, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2654, "step": 5458 }, { "epoch": 1.7133665502765898, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.296, "step": 5459 }, { "epoch": 1.7136804111577544, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.342, "step": 5460 }, { "epoch": 1.7139942720389187, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.6868, "step": 5461 }, { "epoch": 1.7143081329200833, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.3321, "step": 5462 }, { "epoch": 1.7146219938012477, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.6589, "step": 5463 }, { "epoch": 1.714935854682412, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.5774, "step": 5464 }, { "epoch": 1.7152497155635764, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.8594, "step": 5465 }, { "epoch": 1.7155635764447408, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.6811, "step": 5466 }, { "epoch": 1.7158774373259051, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6543, "step": 5467 }, { "epoch": 1.7161912982070697, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.7647, "step": 5468 }, { "epoch": 1.716505159088234, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.9218, "step": 5469 }, { "epoch": 1.7168190199693987, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.9351, "step": 5470 }, { "epoch": 1.717132880850563, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.817, "step": 5471 }, { "epoch": 1.7174467417317274, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.0335, "step": 5472 }, { "epoch": 1.7177606026128918, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 2.0642, "step": 5473 }, { "epoch": 1.7180744634940561, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.6929, "step": 5474 }, { "epoch": 1.7183883243752207, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.0268, "step": 5475 }, { "epoch": 1.718702185256385, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.1018, "step": 5476 }, { "epoch": 1.7190160461375497, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.7464, "step": 5477 }, { "epoch": 1.719329907018714, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.5394, "step": 5478 }, { "epoch": 1.7196437678998784, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.3166, "step": 5479 }, { "epoch": 1.7199576287810427, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.2199, "step": 5480 }, { "epoch": 1.720271489662207, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 2.2483, "step": 5481 }, { "epoch": 1.7205853505433717, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.9631, "step": 5482 }, { "epoch": 1.720899211424536, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.9375, "step": 5483 }, { "epoch": 1.7212130723057006, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.6763, "step": 5484 }, { "epoch": 1.721526933186865, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.4518, "step": 5485 }, { "epoch": 1.7218407940680294, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 2.7767, "step": 5486 }, { "epoch": 1.7221546549491937, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5217, "step": 5487 }, { "epoch": 1.722468515830358, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.3144, "step": 5488 }, { "epoch": 1.7227823767115225, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.2063, "step": 5489 }, { "epoch": 1.723096237592687, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.4158, "step": 5490 }, { "epoch": 1.7234100984738516, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.3824, "step": 5491 }, { "epoch": 1.723723959355016, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5642, "step": 5492 }, { "epoch": 1.7240378202361804, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2813, "step": 5493 }, { "epoch": 1.7243516811173447, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.3315, "step": 5494 }, { "epoch": 1.724665541998509, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3186, "step": 5495 }, { "epoch": 1.7249794028796734, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.2724, "step": 5496 }, { "epoch": 1.725293263760838, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2774, "step": 5497 }, { "epoch": 1.7256071246420024, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4014, "step": 5498 }, { "epoch": 1.725920985523167, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.4198, "step": 5499 }, { "epoch": 1.7262348464043313, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.4122, "step": 5500 }, { "epoch": 1.7265487072854957, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.4289, "step": 5501 }, { "epoch": 1.72686256816666, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.235, "step": 5502 }, { "epoch": 1.7271764290478244, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.2452, "step": 5503 }, { "epoch": 1.727490289928989, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.2983, "step": 5504 }, { "epoch": 1.7278041508101534, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.3754, "step": 5505 }, { "epoch": 1.728118011691318, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.5459, "step": 5506 }, { "epoch": 1.7284318725724823, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4628, "step": 5507 }, { "epoch": 1.7287457334536467, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.0722, "step": 5508 }, { "epoch": 1.729059594334811, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4004, "step": 5509 }, { "epoch": 1.7293734552159754, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.4633, "step": 5510 }, { "epoch": 1.72968731609714, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.489, "step": 5511 }, { "epoch": 1.7300011769783044, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.352, "step": 5512 }, { "epoch": 1.730315037859469, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7752, "step": 5513 }, { "epoch": 1.7306288987406333, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.6925, "step": 5514 }, { "epoch": 1.7309427596217977, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.4145, "step": 5515 }, { "epoch": 1.731256620502962, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7095, "step": 5516 }, { "epoch": 1.7315704813841264, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.5195, "step": 5517 }, { "epoch": 1.7318843422652908, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.5376, "step": 5518 }, { "epoch": 1.7321982031464553, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 2.0259, "step": 5519 }, { "epoch": 1.7325120640276197, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 2.0046, "step": 5520 }, { "epoch": 1.7328259249087843, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.8622, "step": 5521 }, { "epoch": 1.7331397857899487, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7366, "step": 5522 }, { "epoch": 1.733453646671113, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.939, "step": 5523 }, { "epoch": 1.7337675075522774, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.8009, "step": 5524 }, { "epoch": 1.7340813684334417, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.1492, "step": 5525 }, { "epoch": 1.7343952293146063, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.9199, "step": 5526 }, { "epoch": 1.7347090901957707, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.056, "step": 5527 }, { "epoch": 1.7350229510769353, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.013, "step": 5528 }, { "epoch": 1.7353368119580996, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.7799, "step": 5529 }, { "epoch": 1.735650672839264, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.2011, "step": 5530 }, { "epoch": 1.7359645337204284, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.0116, "step": 5531 }, { "epoch": 1.7362783946015927, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.0485, "step": 5532 }, { "epoch": 1.7365922554827573, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.5143, "step": 5533 }, { "epoch": 1.7369061163639217, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9354, "step": 5534 }, { "epoch": 1.7372199772450863, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.1758, "step": 5535 }, { "epoch": 1.7375338381262506, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.7749, "step": 5536 }, { "epoch": 1.737847699007415, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.5269, "step": 5537 }, { "epoch": 1.7381615598885793, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.383, "step": 5538 }, { "epoch": 1.7384754207697437, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.3237, "step": 5539 }, { "epoch": 1.738789281650908, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.2926, "step": 5540 }, { "epoch": 1.7391031425320727, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.3281, "step": 5541 }, { "epoch": 1.7394170034132372, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3902, "step": 5542 }, { "epoch": 1.7397308642944016, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.3967, "step": 5543 }, { "epoch": 1.740044725175566, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.221, "step": 5544 }, { "epoch": 1.7403585860567303, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.1903, "step": 5545 }, { "epoch": 1.7406724469378947, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.2487, "step": 5546 }, { "epoch": 1.740986307819059, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.4515, "step": 5547 }, { "epoch": 1.7413001687002236, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1023, "step": 5548 }, { "epoch": 1.741614029581388, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3426, "step": 5549 }, { "epoch": 1.7419278904625526, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.3032, "step": 5550 }, { "epoch": 1.742241751343717, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3823, "step": 5551 }, { "epoch": 1.7425556122248813, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.3341, "step": 5552 }, { "epoch": 1.7428694731060457, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.3123, "step": 5553 }, { "epoch": 1.74318333398721, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.4137, "step": 5554 }, { "epoch": 1.7434971948683746, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2661, "step": 5555 }, { "epoch": 1.743811055749539, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.3533, "step": 5556 }, { "epoch": 1.7441249166307036, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.2212, "step": 5557 }, { "epoch": 1.744438777511868, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.6135, "step": 5558 }, { "epoch": 1.7447526383930323, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.4766, "step": 5559 }, { "epoch": 1.7450664992741967, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.4571, "step": 5560 }, { "epoch": 1.745380360155361, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3557, "step": 5561 }, { "epoch": 1.7456942210365256, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.6698, "step": 5562 }, { "epoch": 1.74600808191769, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.8339, "step": 5563 }, { "epoch": 1.7463219427988546, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.0184, "step": 5564 }, { "epoch": 1.746635803680019, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.9101, "step": 5565 }, { "epoch": 1.7469496645611833, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.8805, "step": 5566 }, { "epoch": 1.7472635254423476, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.7106, "step": 5567 }, { "epoch": 1.747577386323512, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.6501, "step": 5568 }, { "epoch": 1.7478912472046764, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.7959, "step": 5569 }, { "epoch": 1.748205108085841, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.0345, "step": 5570 }, { "epoch": 1.7485189689670055, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.6343, "step": 5571 }, { "epoch": 1.74883282984817, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.6024, "step": 5572 }, { "epoch": 1.7491466907293343, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.9988, "step": 5573 }, { "epoch": 1.7494605516104986, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.8459, "step": 5574 }, { "epoch": 1.749774412491663, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.3739, "step": 5575 }, { "epoch": 1.7500882733728274, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.9286, "step": 5576 }, { "epoch": 1.750402134253992, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.9319, "step": 5577 }, { "epoch": 1.7507159951351563, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9208, "step": 5578 }, { "epoch": 1.7510298560163209, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.7093, "step": 5579 }, { "epoch": 1.7513437168974852, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 2.2733, "step": 5580 }, { "epoch": 1.7516575777786496, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.5373, "step": 5581 }, { "epoch": 1.751971438659814, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.7424, "step": 5582 }, { "epoch": 1.7522852995409783, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.4287, "step": 5583 }, { "epoch": 1.752599160422143, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.7389, "step": 5584 }, { "epoch": 1.7529130213033073, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 2.3242, "step": 5585 }, { "epoch": 1.7532268821844719, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 2.4223, "step": 5586 }, { "epoch": 1.7535407430656362, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.4838, "step": 5587 }, { "epoch": 1.7538546039468006, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.2177, "step": 5588 }, { "epoch": 1.754168464827965, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.4128, "step": 5589 }, { "epoch": 1.7544823257091293, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3305, "step": 5590 }, { "epoch": 1.754796186590294, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.3185, "step": 5591 }, { "epoch": 1.7551100474714583, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.2726, "step": 5592 }, { "epoch": 1.7554239083526229, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.2291, "step": 5593 }, { "epoch": 1.7557377692337872, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.4381, "step": 5594 }, { "epoch": 1.7560516301149516, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.4414, "step": 5595 }, { "epoch": 1.756365490996116, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.5047, "step": 5596 }, { "epoch": 1.7566793518772803, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.4359, "step": 5597 }, { "epoch": 1.7569932127584447, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.5671, "step": 5598 }, { "epoch": 1.7573070736396093, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.3196, "step": 5599 }, { "epoch": 1.7576209345207736, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.1978, "step": 5600 }, { "epoch": 1.7579347954019382, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.4638, "step": 5601 }, { "epoch": 1.7582486562831026, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3435, "step": 5602 }, { "epoch": 1.758562517164267, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3371, "step": 5603 }, { "epoch": 1.7588763780454313, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3211, "step": 5604 }, { "epoch": 1.7591902389265957, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.3467, "step": 5605 }, { "epoch": 1.7595040998077602, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3954, "step": 5606 }, { "epoch": 1.7598179606889246, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.1211, "step": 5607 }, { "epoch": 1.7601318215700892, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.5554, "step": 5608 }, { "epoch": 1.7604456824512535, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.6266, "step": 5609 }, { "epoch": 1.760759543332418, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.5075, "step": 5610 }, { "epoch": 1.760759543332418, "eval_loss": 1.781874656677246, "eval_runtime": 122.787, "eval_samples_per_second": 8.144, "eval_steps_per_second": 8.144, "step": 5610 }, { "epoch": 1.760759543332418, "mmlu_eval_accuracy": 0.4155586197243395, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.5666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5697674418604651, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.41935483870967744, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.0053661084210148, "step": 5610 }, { "epoch": 1.7610734042135823, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.615, "step": 5611 }, { "epoch": 1.7613872650947466, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.3273, "step": 5612 }, { "epoch": 1.7617011259759112, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.3637, "step": 5613 }, { "epoch": 1.7620149868570756, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.7726, "step": 5614 }, { "epoch": 1.7623288477382402, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.8462, "step": 5615 }, { "epoch": 1.7626427086194045, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.7213, "step": 5616 }, { "epoch": 1.762956569500569, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.8882, "step": 5617 }, { "epoch": 1.7632704303817333, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8901, "step": 5618 }, { "epoch": 1.7635842912628976, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.0592, "step": 5619 }, { "epoch": 1.763898152144062, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.1989, "step": 5620 }, { "epoch": 1.7642120130252266, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.9823, "step": 5621 }, { "epoch": 1.7645258739063912, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.0021, "step": 5622 }, { "epoch": 1.7648397347875555, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.9681, "step": 5623 }, { "epoch": 1.7651535956687199, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.6639, "step": 5624 }, { "epoch": 1.7654674565498842, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.869, "step": 5625 }, { "epoch": 1.7657813174310486, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.9022, "step": 5626 }, { "epoch": 1.766095178312213, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6985, "step": 5627 }, { "epoch": 1.7664090391933776, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.3216, "step": 5628 }, { "epoch": 1.766722900074542, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.9844, "step": 5629 }, { "epoch": 1.7670367609557065, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.9624, "step": 5630 }, { "epoch": 1.7673506218368709, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.2566, "step": 5631 }, { "epoch": 1.7676644827180352, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.3533, "step": 5632 }, { "epoch": 1.7679783435991996, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.6872, "step": 5633 }, { "epoch": 1.768292204480364, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.8253, "step": 5634 }, { "epoch": 1.7686060653615285, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.9717, "step": 5635 }, { "epoch": 1.768919926242693, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.7643, "step": 5636 }, { "epoch": 1.7692337871238575, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.428, "step": 5637 }, { "epoch": 1.7695476480050218, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3254, "step": 5638 }, { "epoch": 1.7698615088861862, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.3439, "step": 5639 }, { "epoch": 1.7701753697673506, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.3656, "step": 5640 }, { "epoch": 1.770489230648515, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.4807, "step": 5641 }, { "epoch": 1.7708030915296795, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.362, "step": 5642 }, { "epoch": 1.7711169524108439, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.2893, "step": 5643 }, { "epoch": 1.7714308132920085, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.459, "step": 5644 }, { "epoch": 1.7717446741731728, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3325, "step": 5645 }, { "epoch": 1.7720585350543372, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.4091, "step": 5646 }, { "epoch": 1.7723723959355016, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.3117, "step": 5647 }, { "epoch": 1.772686256816666, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.3572, "step": 5648 }, { "epoch": 1.7730001176978303, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.4819, "step": 5649 }, { "epoch": 1.7733139785789949, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3822, "step": 5650 }, { "epoch": 1.7736278394601594, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.3218, "step": 5651 }, { "epoch": 1.7739417003413238, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3498, "step": 5652 }, { "epoch": 1.7742555612224882, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3485, "step": 5653 }, { "epoch": 1.7745694221036525, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3077, "step": 5654 }, { "epoch": 1.774883282984817, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.3087, "step": 5655 }, { "epoch": 1.7751971438659813, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.3835, "step": 5656 }, { "epoch": 1.7755110047471458, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.4421, "step": 5657 }, { "epoch": 1.7758248656283102, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.367, "step": 5658 }, { "epoch": 1.7761387265094748, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.5125, "step": 5659 }, { "epoch": 1.7764525873906392, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2472, "step": 5660 }, { "epoch": 1.7767664482718035, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.3469, "step": 5661 }, { "epoch": 1.7770803091529679, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.6867, "step": 5662 }, { "epoch": 1.7773941700341322, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 2.0652, "step": 5663 }, { "epoch": 1.7777080309152968, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.7862, "step": 5664 }, { "epoch": 1.7780218917964612, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.673, "step": 5665 }, { "epoch": 1.7783357526776258, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.5529, "step": 5666 }, { "epoch": 1.7786496135587901, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.889, "step": 5667 }, { "epoch": 1.7789634744399545, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.9459, "step": 5668 }, { "epoch": 1.7792773353211189, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.9317, "step": 5669 }, { "epoch": 1.7795911962022832, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.8915, "step": 5670 }, { "epoch": 1.7799050570834478, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.8185, "step": 5671 }, { "epoch": 1.7802189179646122, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.9135, "step": 5672 }, { "epoch": 1.7805327788457768, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.6733, "step": 5673 }, { "epoch": 1.7808466397269411, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.1443, "step": 5674 }, { "epoch": 1.7811605006081055, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.6653, "step": 5675 }, { "epoch": 1.7814743614892699, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.7433, "step": 5676 }, { "epoch": 1.7817882223704342, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.5781, "step": 5677 }, { "epoch": 1.7821020832515986, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 2.1697, "step": 5678 }, { "epoch": 1.7824159441327632, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.0052, "step": 5679 }, { "epoch": 1.7827298050139275, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.8674, "step": 5680 }, { "epoch": 1.7830436658950921, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.6011, "step": 5681 }, { "epoch": 1.7833575267762565, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.8661, "step": 5682 }, { "epoch": 1.7836713876574208, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 2.1494, "step": 5683 }, { "epoch": 1.7839852485385852, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.9949, "step": 5684 }, { "epoch": 1.7842991094197496, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.9779, "step": 5685 }, { "epoch": 1.7846129703009141, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 2.2549, "step": 5686 }, { "epoch": 1.7849268311820785, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.7224, "step": 5687 }, { "epoch": 1.785240692063243, "grad_norm": 0.12109375, "learning_rate": 0.0002, "loss": 1.0861, "step": 5688 }, { "epoch": 1.7855545529444075, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.3832, "step": 5689 }, { "epoch": 1.7858684138255718, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.3256, "step": 5690 }, { "epoch": 1.7861822747067362, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.508, "step": 5691 }, { "epoch": 1.7864961355879005, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.3166, "step": 5692 }, { "epoch": 1.7868099964690651, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.2937, "step": 5693 }, { "epoch": 1.7871238573502295, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3582, "step": 5694 }, { "epoch": 1.787437718231394, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.3818, "step": 5695 }, { "epoch": 1.7877515791125584, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5635, "step": 5696 }, { "epoch": 1.7880654399937228, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3364, "step": 5697 }, { "epoch": 1.7883793008748872, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.3078, "step": 5698 }, { "epoch": 1.7886931617560515, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.4968, "step": 5699 }, { "epoch": 1.789007022637216, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3104, "step": 5700 }, { "epoch": 1.7893208835183805, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.3153, "step": 5701 }, { "epoch": 1.789634744399545, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.3626, "step": 5702 }, { "epoch": 1.7899486052807094, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3026, "step": 5703 }, { "epoch": 1.7902624661618738, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.2832, "step": 5704 }, { "epoch": 1.7905763270430382, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.2591, "step": 5705 }, { "epoch": 1.7908901879242025, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3971, "step": 5706 }, { "epoch": 1.7912040488053669, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.5718, "step": 5707 }, { "epoch": 1.7915179096865315, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.195, "step": 5708 }, { "epoch": 1.7918317705676958, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2835, "step": 5709 }, { "epoch": 1.7921456314488604, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.4917, "step": 5710 }, { "epoch": 1.7924594923300248, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.5119, "step": 5711 }, { "epoch": 1.7927733532111891, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7028, "step": 5712 }, { "epoch": 1.7930872140923535, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.6387, "step": 5713 }, { "epoch": 1.7934010749735179, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.6766, "step": 5714 }, { "epoch": 1.7937149358546824, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.3949, "step": 5715 }, { "epoch": 1.7940287967358468, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.7873, "step": 5716 }, { "epoch": 1.7943426576170114, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.6646, "step": 5717 }, { "epoch": 1.7946565184981758, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.5148, "step": 5718 }, { "epoch": 1.7949703793793401, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.6509, "step": 5719 }, { "epoch": 1.7952842402605045, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.1718, "step": 5720 }, { "epoch": 1.7955981011416688, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6369, "step": 5721 }, { "epoch": 1.7959119620228334, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.6852, "step": 5722 }, { "epoch": 1.7962258229039978, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.8289, "step": 5723 }, { "epoch": 1.7965396837851624, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.9634, "step": 5724 }, { "epoch": 1.7968535446663267, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.9003, "step": 5725 }, { "epoch": 1.797167405547491, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 2.0595, "step": 5726 }, { "epoch": 1.7974812664286555, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.8105, "step": 5727 }, { "epoch": 1.7977951273098198, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.857, "step": 5728 }, { "epoch": 1.7981089881909842, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.4325, "step": 5729 }, { "epoch": 1.7984228490721488, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 2.6919, "step": 5730 }, { "epoch": 1.7987367099533131, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.9036, "step": 5731 }, { "epoch": 1.7990505708344777, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.8962, "step": 5732 }, { "epoch": 1.799364431715642, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.7343, "step": 5733 }, { "epoch": 1.7996782925968065, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.793, "step": 5734 }, { "epoch": 1.7999921534779708, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.994, "step": 5735 }, { "epoch": 1.8003060143591352, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.2049, "step": 5736 }, { "epoch": 1.8006198752402998, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.4931, "step": 5737 }, { "epoch": 1.8009337361214641, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.3133, "step": 5738 }, { "epoch": 1.8012475970026287, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.2941, "step": 5739 }, { "epoch": 1.801561457883793, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.3704, "step": 5740 }, { "epoch": 1.8018753187649574, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.5015, "step": 5741 }, { "epoch": 1.8021891796461218, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.4961, "step": 5742 }, { "epoch": 1.8025030405272862, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4393, "step": 5743 }, { "epoch": 1.8028169014084507, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.3249, "step": 5744 }, { "epoch": 1.803130762289615, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.3936, "step": 5745 }, { "epoch": 1.8034446231707797, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.1797, "step": 5746 }, { "epoch": 1.803758484051944, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.3464, "step": 5747 }, { "epoch": 1.8040723449331084, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.3781, "step": 5748 }, { "epoch": 1.8043862058142728, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3861, "step": 5749 }, { "epoch": 1.8047000666954371, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.4047, "step": 5750 }, { "epoch": 1.8050139275766015, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.4029, "step": 5751 }, { "epoch": 1.805327788457766, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3544, "step": 5752 }, { "epoch": 1.8056416493389307, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.517, "step": 5753 }, { "epoch": 1.805955510220095, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.7595, "step": 5754 }, { "epoch": 1.8062693711012594, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.2078, "step": 5755 }, { "epoch": 1.8065832319824238, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.5419, "step": 5756 }, { "epoch": 1.8068970928635881, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.236, "step": 5757 }, { "epoch": 1.8072109537447525, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3237, "step": 5758 }, { "epoch": 1.807524814625917, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.3794, "step": 5759 }, { "epoch": 1.8078386755070814, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.528, "step": 5760 }, { "epoch": 1.808152536388246, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.4909, "step": 5761 }, { "epoch": 1.8084663972694104, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.7566, "step": 5762 }, { "epoch": 1.8087802581505747, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.8292, "step": 5763 }, { "epoch": 1.8090941190317391, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.6946, "step": 5764 }, { "epoch": 1.8094079799129035, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.8144, "step": 5765 }, { "epoch": 1.809721840794068, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 2.0289, "step": 5766 }, { "epoch": 1.8100357016752324, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.7732, "step": 5767 }, { "epoch": 1.810349562556397, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6602, "step": 5768 }, { "epoch": 1.8106634234375614, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.0523, "step": 5769 }, { "epoch": 1.8109772843187257, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.8063, "step": 5770 }, { "epoch": 1.81129114519989, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.9525, "step": 5771 }, { "epoch": 1.8116050060810545, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.7752, "step": 5772 }, { "epoch": 1.811918866962219, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.8529, "step": 5773 }, { "epoch": 1.8122327278433834, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.8923, "step": 5774 }, { "epoch": 1.812546588724548, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.8144, "step": 5775 }, { "epoch": 1.8128604496057124, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.1445, "step": 5776 }, { "epoch": 1.8131743104868767, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.9996, "step": 5777 }, { "epoch": 1.813488171368041, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.8569, "step": 5778 }, { "epoch": 1.8138020322492054, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.1226, "step": 5779 }, { "epoch": 1.8141158931303698, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 2.2404, "step": 5780 }, { "epoch": 1.8144297540115344, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.7292, "step": 5781 }, { "epoch": 1.814743614892699, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.7762, "step": 5782 }, { "epoch": 1.8150574757738633, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.6273, "step": 5783 }, { "epoch": 1.8153713366550277, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.7332, "step": 5784 }, { "epoch": 1.815685197536192, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.7667, "step": 5785 }, { "epoch": 1.8159990584173564, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 2.1648, "step": 5786 }, { "epoch": 1.8163129192985208, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.4316, "step": 5787 }, { "epoch": 1.8166267801796854, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.1858, "step": 5788 }, { "epoch": 1.8169406410608497, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.4101, "step": 5789 }, { "epoch": 1.8172545019420143, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5071, "step": 5790 }, { "epoch": 1.8175683628231787, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.2818, "step": 5791 }, { "epoch": 1.817882223704343, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3911, "step": 5792 }, { "epoch": 1.8181960845855074, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.177, "step": 5793 }, { "epoch": 1.8185099454666718, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3634, "step": 5794 }, { "epoch": 1.8188238063478364, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.3598, "step": 5795 }, { "epoch": 1.8191376672290007, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3231, "step": 5796 }, { "epoch": 1.8194515281101653, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.4335, "step": 5797 }, { "epoch": 1.8194515281101653, "eval_loss": 1.782687783241272, "eval_runtime": 123.7187, "eval_samples_per_second": 8.083, "eval_steps_per_second": 8.083, "step": 5797 }, { "epoch": 1.8194515281101653, "mmlu_eval_accuracy": 0.41312694620958684, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.45454545454545453, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.32, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.3058823529411765, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.48148148148148145, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.114936444151643, "step": 5797 }, { "epoch": 1.8197653889913297, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3649, "step": 5798 }, { "epoch": 1.820079249872494, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.288, "step": 5799 }, { "epoch": 1.8203931107536584, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3097, "step": 5800 }, { "epoch": 1.8207069716348228, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3494, "step": 5801 }, { "epoch": 1.8210208325159873, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3934, "step": 5802 }, { "epoch": 1.8213346933971517, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.3018, "step": 5803 }, { "epoch": 1.8216485542783163, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.1474, "step": 5804 }, { "epoch": 1.8219624151594807, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4874, "step": 5805 }, { "epoch": 1.822276276040645, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.2835, "step": 5806 }, { "epoch": 1.8225901369218094, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.4347, "step": 5807 }, { "epoch": 1.8229039978029737, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4818, "step": 5808 }, { "epoch": 1.823217858684138, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.4124, "step": 5809 }, { "epoch": 1.8235317195653027, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.3921, "step": 5810 }, { "epoch": 1.823845580446467, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.56, "step": 5811 }, { "epoch": 1.8241594413276316, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.5578, "step": 5812 }, { "epoch": 1.824473302208796, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.551, "step": 5813 }, { "epoch": 1.8247871630899604, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.7328, "step": 5814 }, { "epoch": 1.8251010239711247, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7849, "step": 5815 }, { "epoch": 1.825414884852289, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.6976, "step": 5816 }, { "epoch": 1.8257287457334537, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 2.0065, "step": 5817 }, { "epoch": 1.826042606614618, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.7382, "step": 5818 }, { "epoch": 1.8263564674957826, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.6435, "step": 5819 }, { "epoch": 1.826670328376947, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.4371, "step": 5820 }, { "epoch": 1.8269841892581113, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.9233, "step": 5821 }, { "epoch": 1.8272980501392757, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.8327, "step": 5822 }, { "epoch": 1.82761191102044, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 2.1253, "step": 5823 }, { "epoch": 1.8279257719016047, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.7072, "step": 5824 }, { "epoch": 1.828239632782769, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.7254, "step": 5825 }, { "epoch": 1.8285534936639336, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.0108, "step": 5826 }, { "epoch": 1.828867354545098, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.0992, "step": 5827 }, { "epoch": 1.8291812154262623, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 2.0665, "step": 5828 }, { "epoch": 1.8294950763074267, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.5506, "step": 5829 }, { "epoch": 1.829808937188591, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.9926, "step": 5830 }, { "epoch": 1.8301227980697554, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9656, "step": 5831 }, { "epoch": 1.83043665895092, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.7434, "step": 5832 }, { "epoch": 1.8307505198320846, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.6967, "step": 5833 }, { "epoch": 1.831064380713249, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.7366, "step": 5834 }, { "epoch": 1.8313782415944133, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.6125, "step": 5835 }, { "epoch": 1.8316921024755777, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 2.7015, "step": 5836 }, { "epoch": 1.832005963356742, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.7805, "step": 5837 }, { "epoch": 1.8323198242379064, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.3908, "step": 5838 }, { "epoch": 1.832633685119071, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.4512, "step": 5839 }, { "epoch": 1.8329475460002354, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.287, "step": 5840 }, { "epoch": 1.8332614068814, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6453, "step": 5841 }, { "epoch": 1.8335752677625643, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2642, "step": 5842 }, { "epoch": 1.8338891286437287, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3075, "step": 5843 }, { "epoch": 1.834202989524893, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.2219, "step": 5844 }, { "epoch": 1.8345168504060574, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.5635, "step": 5845 }, { "epoch": 1.834830711287222, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.3794, "step": 5846 }, { "epoch": 1.8351445721683863, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.5997, "step": 5847 }, { "epoch": 1.835458433049551, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.362, "step": 5848 }, { "epoch": 1.8357722939307153, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.3858, "step": 5849 }, { "epoch": 1.8360861548118796, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3161, "step": 5850 }, { "epoch": 1.836400015693044, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.4395, "step": 5851 }, { "epoch": 1.8367138765742084, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.3102, "step": 5852 }, { "epoch": 1.837027737455373, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3069, "step": 5853 }, { "epoch": 1.8373415983365373, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.4019, "step": 5854 }, { "epoch": 1.837655459217702, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3886, "step": 5855 }, { "epoch": 1.8379693200988663, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.405, "step": 5856 }, { "epoch": 1.8382831809800306, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4266, "step": 5857 }, { "epoch": 1.838597041861195, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.5111, "step": 5858 }, { "epoch": 1.8389109027423594, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.378, "step": 5859 }, { "epoch": 1.8392247636235237, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4416, "step": 5860 }, { "epoch": 1.8395386245046883, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.5043, "step": 5861 }, { "epoch": 1.8398524853858529, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2472, "step": 5862 }, { "epoch": 1.8401663462670172, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.3677, "step": 5863 }, { "epoch": 1.8404802071481816, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.6571, "step": 5864 }, { "epoch": 1.840794068029346, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.6922, "step": 5865 }, { "epoch": 1.8411079289105103, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.5887, "step": 5866 }, { "epoch": 1.8414217897916747, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.7617, "step": 5867 }, { "epoch": 1.8417356506728393, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.8468, "step": 5868 }, { "epoch": 1.8420495115540036, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.8676, "step": 5869 }, { "epoch": 1.8423633724351682, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 2.0462, "step": 5870 }, { "epoch": 1.8426772333163326, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.8405, "step": 5871 }, { "epoch": 1.842991094197497, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.8106, "step": 5872 }, { "epoch": 1.8433049550786613, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.8007, "step": 5873 }, { "epoch": 1.8436188159598257, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.1436, "step": 5874 }, { "epoch": 1.8439326768409903, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 2.1204, "step": 5875 }, { "epoch": 1.8442465377221546, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 2.3496, "step": 5876 }, { "epoch": 1.8445603986033192, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.5854, "step": 5877 }, { "epoch": 1.8448742594844836, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.7404, "step": 5878 }, { "epoch": 1.845188120365648, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.9137, "step": 5879 }, { "epoch": 1.8455019812468123, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.9283, "step": 5880 }, { "epoch": 1.8458158421279767, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 2.3969, "step": 5881 }, { "epoch": 1.846129703009141, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6984, "step": 5882 }, { "epoch": 1.8464435638903056, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.6577, "step": 5883 }, { "epoch": 1.8467574247714702, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.3698, "step": 5884 }, { "epoch": 1.8470712856526346, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 2.6423, "step": 5885 }, { "epoch": 1.847385146533799, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 2.5797, "step": 5886 }, { "epoch": 1.8476990074149633, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.6131, "step": 5887 }, { "epoch": 1.8480128682961277, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.3824, "step": 5888 }, { "epoch": 1.848326729177292, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3644, "step": 5889 }, { "epoch": 1.8486405900584566, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.4024, "step": 5890 }, { "epoch": 1.848954450939621, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.3604, "step": 5891 }, { "epoch": 1.8492683118207855, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.3064, "step": 5892 }, { "epoch": 1.84958217270195, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.2609, "step": 5893 }, { "epoch": 1.8498960335831143, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2902, "step": 5894 }, { "epoch": 1.8502098944642786, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3162, "step": 5895 }, { "epoch": 1.850523755345443, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.3751, "step": 5896 }, { "epoch": 1.8508376162266076, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.468, "step": 5897 }, { "epoch": 1.851151477107772, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3809, "step": 5898 }, { "epoch": 1.8514653379889365, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.2975, "step": 5899 }, { "epoch": 1.851779198870101, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.5673, "step": 5900 }, { "epoch": 1.8520930597512653, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3654, "step": 5901 }, { "epoch": 1.8524069206324296, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3557, "step": 5902 }, { "epoch": 1.852720781513594, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2748, "step": 5903 }, { "epoch": 1.8530346423947586, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.3098, "step": 5904 }, { "epoch": 1.853348503275923, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.1865, "step": 5905 }, { "epoch": 1.8536623641570875, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4136, "step": 5906 }, { "epoch": 1.8539762250382519, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.4703, "step": 5907 }, { "epoch": 1.8542900859194162, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.1209, "step": 5908 }, { "epoch": 1.8546039468005806, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.3871, "step": 5909 }, { "epoch": 1.854917807681745, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.6733, "step": 5910 }, { "epoch": 1.8552316685629093, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.4729, "step": 5911 }, { "epoch": 1.855545529444074, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6418, "step": 5912 }, { "epoch": 1.8558593903252385, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.572, "step": 5913 }, { "epoch": 1.8561732512064029, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.5443, "step": 5914 }, { "epoch": 1.8564871120875672, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.6224, "step": 5915 }, { "epoch": 1.8568009729687316, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.7739, "step": 5916 }, { "epoch": 1.857114833849896, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.5957, "step": 5917 }, { "epoch": 1.8574286947310603, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.7727, "step": 5918 }, { "epoch": 1.857742555612225, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.8576, "step": 5919 }, { "epoch": 1.8580564164933893, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.9285, "step": 5920 }, { "epoch": 1.8583702773745538, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.8665, "step": 5921 }, { "epoch": 1.8586841382557182, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.9875, "step": 5922 }, { "epoch": 1.8589979991368826, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8197, "step": 5923 }, { "epoch": 1.859311860018047, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.8642, "step": 5924 }, { "epoch": 1.8596257208992113, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.4036, "step": 5925 }, { "epoch": 1.8599395817803759, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.5666, "step": 5926 }, { "epoch": 1.8602534426615402, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.2278, "step": 5927 }, { "epoch": 1.8605673035427048, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.8289, "step": 5928 }, { "epoch": 1.8608811644238692, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 2.0646, "step": 5929 }, { "epoch": 1.8611950253050336, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.6013, "step": 5930 }, { "epoch": 1.861508886186198, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.7115, "step": 5931 }, { "epoch": 1.8618227470673623, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 2.0013, "step": 5932 }, { "epoch": 1.8621366079485269, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.7879, "step": 5933 }, { "epoch": 1.8624504688296912, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.1872, "step": 5934 }, { "epoch": 1.8627643297108558, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 2.1315, "step": 5935 }, { "epoch": 1.8630781905920202, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.6237, "step": 5936 }, { "epoch": 1.8633920514731845, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.8178, "step": 5937 }, { "epoch": 1.863705912354349, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.2998, "step": 5938 }, { "epoch": 1.8640197732355133, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3782, "step": 5939 }, { "epoch": 1.8643336341166776, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.399, "step": 5940 }, { "epoch": 1.8646474949978422, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.2741, "step": 5941 }, { "epoch": 1.8649613558790066, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.3048, "step": 5942 }, { "epoch": 1.8652752167601712, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.3547, "step": 5943 }, { "epoch": 1.8655890776413355, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.2876, "step": 5944 }, { "epoch": 1.8659029385224999, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3567, "step": 5945 }, { "epoch": 1.8662167994036643, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.4003, "step": 5946 }, { "epoch": 1.8665306602848286, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.2482, "step": 5947 }, { "epoch": 1.8668445211659932, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.3462, "step": 5948 }, { "epoch": 1.8671583820471576, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3661, "step": 5949 }, { "epoch": 1.8674722429283221, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3655, "step": 5950 }, { "epoch": 1.8677861038094865, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3092, "step": 5951 }, { "epoch": 1.8680999646906509, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.5488, "step": 5952 }, { "epoch": 1.8684138255718152, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3023, "step": 5953 }, { "epoch": 1.8687276864529796, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.4053, "step": 5954 }, { "epoch": 1.8690415473341442, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.6521, "step": 5955 }, { "epoch": 1.8693554082153085, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4736, "step": 5956 }, { "epoch": 1.8696692690964731, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.253, "step": 5957 }, { "epoch": 1.8699831299776375, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.459, "step": 5958 }, { "epoch": 1.8702969908588019, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.6717, "step": 5959 }, { "epoch": 1.8706108517399662, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.657, "step": 5960 }, { "epoch": 1.8709247126211306, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.5178, "step": 5961 }, { "epoch": 1.871238573502295, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.833, "step": 5962 }, { "epoch": 1.8715524343834595, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.6718, "step": 5963 }, { "epoch": 1.8718662952646241, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.6281, "step": 5964 }, { "epoch": 1.8721801561457885, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.4548, "step": 5965 }, { "epoch": 1.8724940170269528, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.3447, "step": 5966 }, { "epoch": 1.8728078779081172, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 2.0358, "step": 5967 }, { "epoch": 1.8731217387892816, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.1333, "step": 5968 }, { "epoch": 1.873435599670446, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.695, "step": 5969 }, { "epoch": 1.8737494605516105, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.848, "step": 5970 }, { "epoch": 1.8740633214327749, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 2.0354, "step": 5971 }, { "epoch": 1.8743771823139395, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.0644, "step": 5972 }, { "epoch": 1.8746910431951038, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.7386, "step": 5973 }, { "epoch": 1.8750049040762682, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.1882, "step": 5974 }, { "epoch": 1.8753187649574325, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.6669, "step": 5975 }, { "epoch": 1.875632625838597, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.5656, "step": 5976 }, { "epoch": 1.8759464867197615, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.1146, "step": 5977 }, { "epoch": 1.8762603476009259, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.2268, "step": 5978 }, { "epoch": 1.8765742084820904, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 2.1216, "step": 5979 }, { "epoch": 1.8768880693632548, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.2109, "step": 5980 }, { "epoch": 1.8772019302444192, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 2.2491, "step": 5981 }, { "epoch": 1.8775157911255835, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.7636, "step": 5982 }, { "epoch": 1.877829652006748, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1231, "step": 5983 }, { "epoch": 1.8781435128879125, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.7568, "step": 5984 }, { "epoch": 1.8781435128879125, "eval_loss": 1.7855916023254395, "eval_runtime": 123.5616, "eval_samples_per_second": 8.093, "eval_steps_per_second": 8.093, "step": 5984 }, { "epoch": 1.8781435128879125, "mmlu_eval_accuracy": 0.42423091770400356, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, "mmlu_eval_accuracy_college_medicine": 0.4090909090909091, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.5, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093, "mmlu_eval_accuracy_moral_disputes": 0.5526315789473685, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.5151515151515151, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.6666666666666666, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.1574250328665383, "step": 5984 }, { "epoch": 1.8784573737690768, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 2.0926, "step": 5985 }, { "epoch": 1.8787712346502414, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 2.523, "step": 5986 }, { "epoch": 1.8790850955314058, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.4967, "step": 5987 }, { "epoch": 1.8793989564125702, "grad_norm": 0.1298828125, "learning_rate": 0.0002, "loss": 1.2295, "step": 5988 }, { "epoch": 1.8797128172937345, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.2173, "step": 5989 }, { "epoch": 1.8800266781748989, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.5014, "step": 5990 }, { "epoch": 1.8803405390560632, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.3897, "step": 5991 }, { "epoch": 1.8806543999372278, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.1561, "step": 5992 }, { "epoch": 1.8809682608183924, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.3402, "step": 5993 }, { "epoch": 1.8812821216995568, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.4062, "step": 5994 }, { "epoch": 1.8815959825807211, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.4649, "step": 5995 }, { "epoch": 1.8819098434618855, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.3717, "step": 5996 }, { "epoch": 1.8822237043430499, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.4346, "step": 5997 }, { "epoch": 1.8825375652242142, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.3202, "step": 5998 }, { "epoch": 1.8828514261053788, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.3707, "step": 5999 }, { "epoch": 1.8831652869865432, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3233, "step": 6000 }, { "epoch": 1.8834791478677078, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2141, "step": 6001 }, { "epoch": 1.8837930087488721, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.3269, "step": 6002 }, { "epoch": 1.8841068696300365, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.3923, "step": 6003 }, { "epoch": 1.8844207305112008, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.2323, "step": 6004 }, { "epoch": 1.8847345913923652, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.424, "step": 6005 }, { "epoch": 1.8850484522735298, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.5137, "step": 6006 }, { "epoch": 1.8853623131546942, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.5626, "step": 6007 }, { "epoch": 1.8856761740358587, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.5102, "step": 6008 }, { "epoch": 1.885990034917023, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2722, "step": 6009 }, { "epoch": 1.8863038957981875, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.6836, "step": 6010 }, { "epoch": 1.8866177566793518, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.611, "step": 6011 }, { "epoch": 1.8869316175605162, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.677, "step": 6012 }, { "epoch": 1.8872454784416808, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 2.1169, "step": 6013 }, { "epoch": 1.8875593393228451, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.4547, "step": 6014 }, { "epoch": 1.8878732002040097, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.7594, "step": 6015 }, { "epoch": 1.888187061085174, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.4907, "step": 6016 }, { "epoch": 1.8885009219663385, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.6983, "step": 6017 }, { "epoch": 1.8888147828475028, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.7239, "step": 6018 }, { "epoch": 1.8891286437286672, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.9725, "step": 6019 }, { "epoch": 1.8894425046098315, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.842, "step": 6020 }, { "epoch": 1.8897563654909961, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.9049, "step": 6021 }, { "epoch": 1.8900702263721605, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.6995, "step": 6022 }, { "epoch": 1.890384087253325, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 2.1398, "step": 6023 }, { "epoch": 1.8906979481344894, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.1568, "step": 6024 }, { "epoch": 1.8910118090156538, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 2.4192, "step": 6025 }, { "epoch": 1.8913256698968182, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 2.1758, "step": 6026 }, { "epoch": 1.8916395307779825, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.8815, "step": 6027 }, { "epoch": 1.891953391659147, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.5712, "step": 6028 }, { "epoch": 1.8922672525403115, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 2.0794, "step": 6029 }, { "epoch": 1.892581113421476, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.7872, "step": 6030 }, { "epoch": 1.8928949743026404, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.3885, "step": 6031 }, { "epoch": 1.8932088351838048, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.874, "step": 6032 }, { "epoch": 1.8935226960649691, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.9045, "step": 6033 }, { "epoch": 1.8938365569461335, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.6468, "step": 6034 }, { "epoch": 1.894150417827298, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.8801, "step": 6035 }, { "epoch": 1.8944642787084625, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 2.6754, "step": 6036 }, { "epoch": 1.894778139589627, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.2811, "step": 6037 }, { "epoch": 1.8950920004707914, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.2129, "step": 6038 }, { "epoch": 1.8954058613519558, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4311, "step": 6039 }, { "epoch": 1.8957197222331201, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.2499, "step": 6040 }, { "epoch": 1.8960335831142845, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.3589, "step": 6041 }, { "epoch": 1.8963474439954489, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.2864, "step": 6042 }, { "epoch": 1.8966613048766134, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.5288, "step": 6043 }, { "epoch": 1.896975165757778, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.2375, "step": 6044 }, { "epoch": 1.8972890266389424, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3652, "step": 6045 }, { "epoch": 1.8976028875201068, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.3556, "step": 6046 }, { "epoch": 1.8979167484012711, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2775, "step": 6047 }, { "epoch": 1.8982306092824355, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.3814, "step": 6048 }, { "epoch": 1.8985444701635998, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4055, "step": 6049 }, { "epoch": 1.8988583310447644, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.3125, "step": 6050 }, { "epoch": 1.8991721919259288, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3607, "step": 6051 }, { "epoch": 1.8994860528070934, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3836, "step": 6052 }, { "epoch": 1.8997999136882577, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3888, "step": 6053 }, { "epoch": 1.900113774569422, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.2514, "step": 6054 }, { "epoch": 1.9004276354505865, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.4204, "step": 6055 }, { "epoch": 1.9007414963317508, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3758, "step": 6056 }, { "epoch": 1.9010553572129154, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.2802, "step": 6057 }, { "epoch": 1.9013692180940798, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3027, "step": 6058 }, { "epoch": 1.9016830789752444, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.3404, "step": 6059 }, { "epoch": 1.9019969398564087, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1739, "step": 6060 }, { "epoch": 1.902310800737573, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.4307, "step": 6061 }, { "epoch": 1.9026246616187374, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3368, "step": 6062 }, { "epoch": 1.9029385224999018, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.5696, "step": 6063 }, { "epoch": 1.9032523833810664, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.3868, "step": 6064 }, { "epoch": 1.9035662442622308, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.7312, "step": 6065 }, { "epoch": 1.9038801051433953, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.6214, "step": 6066 }, { "epoch": 1.9041939660245597, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.6447, "step": 6067 }, { "epoch": 1.904507826905724, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.844, "step": 6068 }, { "epoch": 1.9048216877868884, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.6066, "step": 6069 }, { "epoch": 1.9051355486680528, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.8824, "step": 6070 }, { "epoch": 1.9054494095492172, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.9519, "step": 6071 }, { "epoch": 1.9057632704303817, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.7469, "step": 6072 }, { "epoch": 1.906077131311546, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.9807, "step": 6073 }, { "epoch": 1.9063909921927107, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 2.1678, "step": 6074 }, { "epoch": 1.906704853073875, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 2.1662, "step": 6075 }, { "epoch": 1.9070187139550394, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.9905, "step": 6076 }, { "epoch": 1.9073325748362038, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.1265, "step": 6077 }, { "epoch": 1.9076464357173681, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 2.4555, "step": 6078 }, { "epoch": 1.9079602965985327, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.7651, "step": 6079 }, { "epoch": 1.908274157479697, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.8466, "step": 6080 }, { "epoch": 1.9085880183608617, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.9321, "step": 6081 }, { "epoch": 1.908901879242026, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.6196, "step": 6082 }, { "epoch": 1.9092157401231904, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.7671, "step": 6083 }, { "epoch": 1.9095296010043548, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.9945, "step": 6084 }, { "epoch": 1.9098434618855191, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.8311, "step": 6085 }, { "epoch": 1.9101573227666837, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.6576, "step": 6086 }, { "epoch": 1.910471183647848, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.3709, "step": 6087 }, { "epoch": 1.9107850445290127, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.3617, "step": 6088 }, { "epoch": 1.911098905410177, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.3939, "step": 6089 }, { "epoch": 1.9114127662913414, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.2023, "step": 6090 }, { "epoch": 1.9117266271725057, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.1513, "step": 6091 }, { "epoch": 1.91204048805367, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.3465, "step": 6092 }, { "epoch": 1.9123543489348345, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.3369, "step": 6093 }, { "epoch": 1.912668209815999, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.5491, "step": 6094 }, { "epoch": 1.9129820706971636, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.5203, "step": 6095 }, { "epoch": 1.913295931578328, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.4288, "step": 6096 }, { "epoch": 1.9136097924594924, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3437, "step": 6097 }, { "epoch": 1.9139236533406567, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.5743, "step": 6098 }, { "epoch": 1.914237514221821, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.1673, "step": 6099 }, { "epoch": 1.9145513751029855, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4265, "step": 6100 }, { "epoch": 1.91486523598415, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.2792, "step": 6101 }, { "epoch": 1.9151790968653144, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.3234, "step": 6102 }, { "epoch": 1.915492957746479, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3086, "step": 6103 }, { "epoch": 1.9158068186276433, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.1944, "step": 6104 }, { "epoch": 1.9161206795088077, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.3692, "step": 6105 }, { "epoch": 1.916434540389972, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.3932, "step": 6106 }, { "epoch": 1.9167484012711364, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4377, "step": 6107 }, { "epoch": 1.917062262152301, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.469, "step": 6108 }, { "epoch": 1.9173761230334654, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3331, "step": 6109 }, { "epoch": 1.91768998391463, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.3366, "step": 6110 }, { "epoch": 1.9180038447957943, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.1168, "step": 6111 }, { "epoch": 1.9183177056769587, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3462, "step": 6112 }, { "epoch": 1.918631566558123, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.4024, "step": 6113 }, { "epoch": 1.9189454274392874, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.5792, "step": 6114 }, { "epoch": 1.919259288320452, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.8491, "step": 6115 }, { "epoch": 1.9195731492016164, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8856, "step": 6116 }, { "epoch": 1.919887010082781, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7271, "step": 6117 }, { "epoch": 1.9202008709639453, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.8991, "step": 6118 }, { "epoch": 1.9205147318451097, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.7717, "step": 6119 }, { "epoch": 1.920828592726274, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.1364, "step": 6120 }, { "epoch": 1.9211424536074384, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.7342, "step": 6121 }, { "epoch": 1.9214563144886028, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.7976, "step": 6122 }, { "epoch": 1.9217701753697674, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.7846, "step": 6123 }, { "epoch": 1.922084036250932, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 2.0553, "step": 6124 }, { "epoch": 1.9223978971320963, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.7303, "step": 6125 }, { "epoch": 1.9227117580132607, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.1597, "step": 6126 }, { "epoch": 1.923025618894425, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.9155, "step": 6127 }, { "epoch": 1.9233394797755894, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.9295, "step": 6128 }, { "epoch": 1.9236533406567538, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 2.2244, "step": 6129 }, { "epoch": 1.9239672015379183, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.6728, "step": 6130 }, { "epoch": 1.9242810624190827, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.9492, "step": 6131 }, { "epoch": 1.9245949233002473, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.8253, "step": 6132 }, { "epoch": 1.9249087841814116, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.8183, "step": 6133 }, { "epoch": 1.925222645062576, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.7386, "step": 6134 }, { "epoch": 1.9255365059437404, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.9165, "step": 6135 }, { "epoch": 1.9258503668249047, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 3.2698, "step": 6136 }, { "epoch": 1.9261642277060693, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.5735, "step": 6137 }, { "epoch": 1.9264780885872337, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.42, "step": 6138 }, { "epoch": 1.9267919494683983, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.321, "step": 6139 }, { "epoch": 1.9271058103495626, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.3791, "step": 6140 }, { "epoch": 1.927419671230727, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.4568, "step": 6141 }, { "epoch": 1.9277335321118914, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.3315, "step": 6142 }, { "epoch": 1.9280473929930557, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.347, "step": 6143 }, { "epoch": 1.9283612538742203, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.3535, "step": 6144 }, { "epoch": 1.9286751147553847, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.261, "step": 6145 }, { "epoch": 1.9289889756365493, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.3203, "step": 6146 }, { "epoch": 1.9293028365177136, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.4292, "step": 6147 }, { "epoch": 1.929616697398878, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.1916, "step": 6148 }, { "epoch": 1.9299305582800423, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3454, "step": 6149 }, { "epoch": 1.9302444191612067, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.26, "step": 6150 }, { "epoch": 1.930558280042371, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4168, "step": 6151 }, { "epoch": 1.9308721409235357, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.4485, "step": 6152 }, { "epoch": 1.9311860018047, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.3304, "step": 6153 }, { "epoch": 1.9314998626858646, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.4071, "step": 6154 }, { "epoch": 1.931813723567029, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.4118, "step": 6155 }, { "epoch": 1.9321275844481933, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.4228, "step": 6156 }, { "epoch": 1.9324414453293577, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.2848, "step": 6157 }, { "epoch": 1.932755306210522, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3849, "step": 6158 }, { "epoch": 1.9330691670916866, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.4587, "step": 6159 }, { "epoch": 1.933383027972851, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.4131, "step": 6160 }, { "epoch": 1.9336968888540156, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.3038, "step": 6161 }, { "epoch": 1.93401074973518, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.6321, "step": 6162 }, { "epoch": 1.9343246106163443, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.6609, "step": 6163 }, { "epoch": 1.9346384714975087, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.617, "step": 6164 }, { "epoch": 1.934952332378673, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.6255, "step": 6165 }, { "epoch": 1.9352661932598376, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.6832, "step": 6166 }, { "epoch": 1.935580054141002, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.4293, "step": 6167 }, { "epoch": 1.9358939150221666, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.4334, "step": 6168 }, { "epoch": 1.936207775903331, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.6668, "step": 6169 }, { "epoch": 1.9365216367844953, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.9475, "step": 6170 }, { "epoch": 1.9368354976656597, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.7741, "step": 6171 }, { "epoch": 1.9368354976656597, "eval_loss": 1.78682541847229, "eval_runtime": 152.5449, "eval_samples_per_second": 6.555, "eval_steps_per_second": 6.555, "step": 6171 }, { "epoch": 1.9368354976656597, "mmlu_eval_accuracy": 0.4197257727194544, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.5, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4418604651162791, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.4230769230769231, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.65, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.3, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.213808760309749, "step": 6171 }, { "epoch": 1.937149358546824, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.8713, "step": 6172 }, { "epoch": 1.9374632194279884, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.7965, "step": 6173 }, { "epoch": 1.937777080309153, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.7159, "step": 6174 }, { "epoch": 1.9380909411903176, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.9154, "step": 6175 }, { "epoch": 1.938404802071482, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.9675, "step": 6176 }, { "epoch": 1.9387186629526463, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 2.4266, "step": 6177 }, { "epoch": 1.9390325238338106, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.2141, "step": 6178 }, { "epoch": 1.939346384714975, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.9954, "step": 6179 }, { "epoch": 1.9396602455961394, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.7545, "step": 6180 }, { "epoch": 1.939974106477304, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 2.0048, "step": 6181 }, { "epoch": 1.9402879673584683, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.7197, "step": 6182 }, { "epoch": 1.940601828239633, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.9448, "step": 6183 }, { "epoch": 1.9409156891207973, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.8629, "step": 6184 }, { "epoch": 1.9412295500019616, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.8311, "step": 6185 }, { "epoch": 1.941543410883126, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.6935, "step": 6186 }, { "epoch": 1.9418572717642903, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.483, "step": 6187 }, { "epoch": 1.942171132645455, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.2082, "step": 6188 }, { "epoch": 1.9424849935266193, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.3822, "step": 6189 }, { "epoch": 1.9427988544077839, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.3582, "step": 6190 }, { "epoch": 1.9431127152889482, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.295, "step": 6191 }, { "epoch": 1.9434265761701126, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.2375, "step": 6192 }, { "epoch": 1.943740437051277, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.2288, "step": 6193 }, { "epoch": 1.9440542979324413, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3575, "step": 6194 }, { "epoch": 1.944368158813606, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.5366, "step": 6195 }, { "epoch": 1.9446820196947703, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2837, "step": 6196 }, { "epoch": 1.9449958805759349, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.3292, "step": 6197 }, { "epoch": 1.9453097414570992, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.3007, "step": 6198 }, { "epoch": 1.9456236023382636, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3888, "step": 6199 }, { "epoch": 1.945937463219428, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.3016, "step": 6200 }, { "epoch": 1.9462513241005923, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.254, "step": 6201 }, { "epoch": 1.9465651849817567, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.414, "step": 6202 }, { "epoch": 1.9468790458629213, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.2443, "step": 6203 }, { "epoch": 1.9471929067440858, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3593, "step": 6204 }, { "epoch": 1.9475067676252502, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3608, "step": 6205 }, { "epoch": 1.9478206285064146, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4143, "step": 6206 }, { "epoch": 1.948134489387579, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.1969, "step": 6207 }, { "epoch": 1.9484483502687433, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3597, "step": 6208 }, { "epoch": 1.9487622111499077, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.3017, "step": 6209 }, { "epoch": 1.9490760720310722, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.3243, "step": 6210 }, { "epoch": 1.9493899329122366, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.4498, "step": 6211 }, { "epoch": 1.9497037937934012, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.5024, "step": 6212 }, { "epoch": 1.9500176546745656, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.7039, "step": 6213 }, { "epoch": 1.95033151555573, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.73, "step": 6214 }, { "epoch": 1.9506453764368943, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.7362, "step": 6215 }, { "epoch": 1.9509592373180586, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.7638, "step": 6216 }, { "epoch": 1.9512730981992232, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.896, "step": 6217 }, { "epoch": 1.9515869590803876, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.7783, "step": 6218 }, { "epoch": 1.9519008199615522, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.5481, "step": 6219 }, { "epoch": 1.9522146808427165, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.6818, "step": 6220 }, { "epoch": 1.952528541723881, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.8717, "step": 6221 }, { "epoch": 1.9528424026050453, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.8411, "step": 6222 }, { "epoch": 1.9531562634862096, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 2.1213, "step": 6223 }, { "epoch": 1.9534701243673742, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.0931, "step": 6224 }, { "epoch": 1.9537839852485386, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.8087, "step": 6225 }, { "epoch": 1.9540978461297032, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.9759, "step": 6226 }, { "epoch": 1.9544117070108675, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.0105, "step": 6227 }, { "epoch": 1.954725567892032, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 2.4152, "step": 6228 }, { "epoch": 1.9550394287731963, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.1308, "step": 6229 }, { "epoch": 1.9553532896543606, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.7342, "step": 6230 }, { "epoch": 1.955667150535525, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.3799, "step": 6231 }, { "epoch": 1.9559810114166896, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.0526, "step": 6232 }, { "epoch": 1.956294872297854, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.8936, "step": 6233 }, { "epoch": 1.9566087331790185, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.8216, "step": 6234 }, { "epoch": 1.9569225940601829, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 2.5648, "step": 6235 }, { "epoch": 1.9572364549413472, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.8273, "step": 6236 }, { "epoch": 1.9575503158225116, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.4938, "step": 6237 }, { "epoch": 1.957864176703676, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.4521, "step": 6238 }, { "epoch": 1.9581780375848405, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.2935, "step": 6239 }, { "epoch": 1.958491898466005, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.2917, "step": 6240 }, { "epoch": 1.9588057593471695, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3696, "step": 6241 }, { "epoch": 1.9591196202283339, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.4037, "step": 6242 }, { "epoch": 1.9594334811094982, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.3007, "step": 6243 }, { "epoch": 1.9597473419906626, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.3639, "step": 6244 }, { "epoch": 1.960061202871827, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.3895, "step": 6245 }, { "epoch": 1.9603750637529915, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.2628, "step": 6246 }, { "epoch": 1.960688924634156, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.3607, "step": 6247 }, { "epoch": 1.9610027855153205, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.4322, "step": 6248 }, { "epoch": 1.9613166463964848, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.2913, "step": 6249 }, { "epoch": 1.9616305072776492, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.241, "step": 6250 }, { "epoch": 1.9619443681588136, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.3618, "step": 6251 }, { "epoch": 1.962258229039978, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.2447, "step": 6252 }, { "epoch": 1.9625720899211423, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.2828, "step": 6253 }, { "epoch": 1.9628859508023069, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.2431, "step": 6254 }, { "epoch": 1.9631998116834715, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.298, "step": 6255 }, { "epoch": 1.9635136725646358, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.2924, "step": 6256 }, { "epoch": 1.9638275334458002, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.6099, "step": 6257 }, { "epoch": 1.9641413943269646, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.3266, "step": 6258 }, { "epoch": 1.964455255208129, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3105, "step": 6259 }, { "epoch": 1.9647691160892933, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.4488, "step": 6260 }, { "epoch": 1.9650829769704579, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.7142, "step": 6261 }, { "epoch": 1.9653968378516222, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.5408, "step": 6262 }, { "epoch": 1.9657106987327868, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.6264, "step": 6263 }, { "epoch": 1.9660245596139512, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.5745, "step": 6264 }, { "epoch": 1.9663384204951155, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 2.0871, "step": 6265 }, { "epoch": 1.96665228137628, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.5642, "step": 6266 }, { "epoch": 1.9669661422574443, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.6649, "step": 6267 }, { "epoch": 1.9672800031386088, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.9547, "step": 6268 }, { "epoch": 1.9675938640197732, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.791, "step": 6269 }, { "epoch": 1.9679077249009378, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.2883, "step": 6270 }, { "epoch": 1.9682215857821022, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 2.1999, "step": 6271 }, { "epoch": 1.9685354466632665, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 2.0373, "step": 6272 }, { "epoch": 1.9688493075444309, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.9539, "step": 6273 }, { "epoch": 1.9691631684255952, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.3226, "step": 6274 }, { "epoch": 1.9694770293067598, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.5914, "step": 6275 }, { "epoch": 1.9697908901879242, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.8675, "step": 6276 }, { "epoch": 1.9701047510690888, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.9848, "step": 6277 }, { "epoch": 1.9704186119502531, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.6943, "step": 6278 }, { "epoch": 1.9707324728314175, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.2691, "step": 6279 }, { "epoch": 1.9710463337125819, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.1909, "step": 6280 }, { "epoch": 1.9713601945937462, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.7719, "step": 6281 }, { "epoch": 1.9716740554749106, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.9302, "step": 6282 }, { "epoch": 1.9719879163560752, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 2.1265, "step": 6283 }, { "epoch": 1.9723017772372395, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4069, "step": 6284 }, { "epoch": 1.9726156381184041, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 2.089, "step": 6285 }, { "epoch": 1.9729294989995685, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 2.8715, "step": 6286 }, { "epoch": 1.9732433598807329, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.5183, "step": 6287 }, { "epoch": 1.9735572207618972, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.378, "step": 6288 }, { "epoch": 1.9738710816430616, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.4105, "step": 6289 }, { "epoch": 1.9741849425242262, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.4007, "step": 6290 }, { "epoch": 1.9744988034053905, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.4438, "step": 6291 }, { "epoch": 1.974812664286555, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.3228, "step": 6292 }, { "epoch": 1.9751265251677195, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.4804, "step": 6293 }, { "epoch": 1.9754403860488838, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4462, "step": 6294 }, { "epoch": 1.9757542469300482, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.2349, "step": 6295 }, { "epoch": 1.9760681078112126, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.2008, "step": 6296 }, { "epoch": 1.9763819686923771, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3636, "step": 6297 }, { "epoch": 1.9766958295735415, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3708, "step": 6298 }, { "epoch": 1.977009690454706, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.4474, "step": 6299 }, { "epoch": 1.9773235513358705, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3537, "step": 6300 }, { "epoch": 1.9776374122170348, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3297, "step": 6301 }, { "epoch": 1.9779512730981992, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.5156, "step": 6302 }, { "epoch": 1.9782651339793635, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.1508, "step": 6303 }, { "epoch": 1.978578994860528, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.2305, "step": 6304 }, { "epoch": 1.9788928557416925, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2829, "step": 6305 }, { "epoch": 1.979206716622857, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.318, "step": 6306 }, { "epoch": 1.9795205775040214, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3873, "step": 6307 }, { "epoch": 1.9798344383851858, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2953, "step": 6308 }, { "epoch": 1.9801482992663502, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.4672, "step": 6309 }, { "epoch": 1.9804621601475145, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.4184, "step": 6310 }, { "epoch": 1.980776021028679, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.4374, "step": 6311 }, { "epoch": 1.9810898819098435, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.4165, "step": 6312 }, { "epoch": 1.9814037427910078, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.7993, "step": 6313 }, { "epoch": 1.9817176036721724, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.467, "step": 6314 }, { "epoch": 1.9820314645533368, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.8822, "step": 6315 }, { "epoch": 1.9823453254345011, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.7527, "step": 6316 }, { "epoch": 1.9826591863156655, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.7072, "step": 6317 }, { "epoch": 1.9829730471968299, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.754, "step": 6318 }, { "epoch": 1.9832869080779945, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.5057, "step": 6319 }, { "epoch": 1.9836007689591588, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 2.2067, "step": 6320 }, { "epoch": 1.9839146298403234, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.9669, "step": 6321 }, { "epoch": 1.9842284907214878, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.9307, "step": 6322 }, { "epoch": 1.9845423516026521, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 2.0883, "step": 6323 }, { "epoch": 1.9848562124838165, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 2.1274, "step": 6324 }, { "epoch": 1.9851700733649809, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.9849, "step": 6325 }, { "epoch": 1.9854839342461454, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.7451, "step": 6326 }, { "epoch": 1.9857977951273098, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.8038, "step": 6327 }, { "epoch": 1.9861116560084744, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 2.4209, "step": 6328 }, { "epoch": 1.9864255168896388, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.948, "step": 6329 }, { "epoch": 1.9867393777708031, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 2.096, "step": 6330 }, { "epoch": 1.9870532386519675, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.7269, "step": 6331 }, { "epoch": 1.9873670995331318, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.4452, "step": 6332 }, { "epoch": 1.9876809604142962, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.6012, "step": 6333 }, { "epoch": 1.9879948212954608, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 2.0283, "step": 6334 }, { "epoch": 1.9883086821766254, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 2.1029, "step": 6335 }, { "epoch": 1.9886225430577897, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 2.6477, "step": 6336 }, { "epoch": 1.988936403938954, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.549, "step": 6337 }, { "epoch": 1.9892502648201185, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.464, "step": 6338 }, { "epoch": 1.9895641257012828, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.4544, "step": 6339 }, { "epoch": 1.9898779865824472, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.4293, "step": 6340 }, { "epoch": 1.9901918474636118, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.3721, "step": 6341 }, { "epoch": 1.9905057083447761, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.4958, "step": 6342 }, { "epoch": 1.9908195692259407, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2461, "step": 6343 }, { "epoch": 1.991133430107105, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.3111, "step": 6344 }, { "epoch": 1.9914472909882694, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3667, "step": 6345 }, { "epoch": 1.9917611518694338, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3126, "step": 6346 }, { "epoch": 1.9920750127505982, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4414, "step": 6347 }, { "epoch": 1.9923888736317628, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4253, "step": 6348 }, { "epoch": 1.9927027345129271, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2938, "step": 6349 }, { "epoch": 1.9930165953940917, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.5293, "step": 6350 }, { "epoch": 1.993330456275256, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.4518, "step": 6351 }, { "epoch": 1.9936443171564204, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3971, "step": 6352 }, { "epoch": 1.9939581780375848, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.5387, "step": 6353 }, { "epoch": 1.9942720389187492, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.8592, "step": 6354 }, { "epoch": 1.9945858997999137, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.5055, "step": 6355 }, { "epoch": 1.994899760681078, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6136, "step": 6356 }, { "epoch": 1.9952136215622427, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.7147, "step": 6357 }, { "epoch": 1.995527482443407, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.4932, "step": 6358 }, { "epoch": 1.995527482443407, "eval_loss": 1.7805804014205933, "eval_runtime": 155.909, "eval_samples_per_second": 6.414, "eval_steps_per_second": 6.414, "step": 6358 }, { "epoch": 1.995527482443407, "mmlu_eval_accuracy": 0.409254918544326, "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.55, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5465116279069767, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.26, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.5161290322580645, "mmlu_eval_accuracy_professional_law": 0.3235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.3345751797039473, "step": 6358 }, { "epoch": 1.9958413433245714, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.6581, "step": 6359 }, { "epoch": 1.9961552042057358, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.9552, "step": 6360 }, { "epoch": 1.9964690650869001, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.8435, "step": 6361 }, { "epoch": 1.9967829259680645, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 2.5541, "step": 6362 }, { "epoch": 1.997096786849229, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 2.1237, "step": 6363 }, { "epoch": 1.9974106477303935, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 2.0774, "step": 6364 }, { "epoch": 1.997724508611558, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 2.0212, "step": 6365 }, { "epoch": 1.9980383694927224, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.1173, "step": 6366 }, { "epoch": 1.9983522303738868, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.6951, "step": 6367 }, { "epoch": 1.9986660912550511, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 2.1471, "step": 6368 }, { "epoch": 1.9989799521362155, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 2.0464, "step": 6369 }, { "epoch": 1.99929381301738, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.8361, "step": 6370 }, { "epoch": 1.9996076738985444, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 2.2817, "step": 6371 }, { "epoch": 1.999921534779709, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 2.1017, "step": 6372 }, { "epoch": 2.0002353956608734, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.5499, "step": 6373 }, { "epoch": 2.0005492565420377, "grad_norm": 0.12353515625, "learning_rate": 0.0002, "loss": 1.3603, "step": 6374 }, { "epoch": 2.000863117423202, "grad_norm": 0.125, "learning_rate": 0.0002, "loss": 1.1976, "step": 6375 }, { "epoch": 2.0011769783043665, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.2112, "step": 6376 }, { "epoch": 2.001490839185531, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.2047, "step": 6377 }, { "epoch": 2.0018047000666956, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.3085, "step": 6378 }, { "epoch": 2.00211856094786, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2039, "step": 6379 }, { "epoch": 2.0024324218290244, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.2814, "step": 6380 }, { "epoch": 2.0027462827101887, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.3825, "step": 6381 }, { "epoch": 2.003060143591353, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.281, "step": 6382 }, { "epoch": 2.0033740044725175, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.2624, "step": 6383 }, { "epoch": 2.003687865353682, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.1571, "step": 6384 }, { "epoch": 2.0040017262348466, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.2529, "step": 6385 }, { "epoch": 2.004315587116011, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.1393, "step": 6386 }, { "epoch": 2.0046294479971754, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.1735, "step": 6387 }, { "epoch": 2.0049433088783397, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.0209, "step": 6388 }, { "epoch": 2.005257169759504, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.2537, "step": 6389 }, { "epoch": 2.0055710306406684, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.2957, "step": 6390 }, { "epoch": 2.005884891521833, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.1899, "step": 6391 }, { "epoch": 2.006198752402997, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1961, "step": 6392 }, { "epoch": 2.006512613284162, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2249, "step": 6393 }, { "epoch": 2.0068264741653263, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1087, "step": 6394 }, { "epoch": 2.0071403350464907, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.3485, "step": 6395 }, { "epoch": 2.007454195927655, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.1663, "step": 6396 }, { "epoch": 2.0077680568088194, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.168, "step": 6397 }, { "epoch": 2.008081917689984, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.1941, "step": 6398 }, { "epoch": 2.008395778571148, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.281, "step": 6399 }, { "epoch": 2.008709639452313, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.2982, "step": 6400 }, { "epoch": 2.0090235003334773, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.223, "step": 6401 }, { "epoch": 2.0093373612146417, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.5336, "step": 6402 }, { "epoch": 2.009651222095806, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.5812, "step": 6403 }, { "epoch": 2.0099650829769704, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.1882, "step": 6404 }, { "epoch": 2.0102789438581348, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.5586, "step": 6405 }, { "epoch": 2.010592804739299, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.7124, "step": 6406 }, { "epoch": 2.010906665620464, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.5269, "step": 6407 }, { "epoch": 2.0112205265016283, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.8241, "step": 6408 }, { "epoch": 2.0115343873827927, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5197, "step": 6409 }, { "epoch": 2.011848248263957, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.5815, "step": 6410 }, { "epoch": 2.0121621091451214, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.6764, "step": 6411 }, { "epoch": 2.0124759700262858, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.8467, "step": 6412 }, { "epoch": 2.01278983090745, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.572, "step": 6413 }, { "epoch": 2.0131036917886145, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.4858, "step": 6414 }, { "epoch": 2.0134175526697793, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.422, "step": 6415 }, { "epoch": 2.0137314135509436, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.4264, "step": 6416 }, { "epoch": 2.014045274432108, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.5191, "step": 6417 }, { "epoch": 2.0143591353132724, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.4893, "step": 6418 }, { "epoch": 2.0146729961944367, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.6836, "step": 6419 }, { "epoch": 2.014986857075601, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.4381, "step": 6420 }, { "epoch": 2.0153007179567655, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.3994, "step": 6421 }, { "epoch": 2.0156145788379303, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 1.7388, "step": 6422 }, { "epoch": 2.0159284397190946, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.5368, "step": 6423 }, { "epoch": 2.016242300600259, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.3737, "step": 6424 }, { "epoch": 2.0165561614814234, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3951, "step": 6425 }, { "epoch": 2.0168700223625877, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.273, "step": 6426 }, { "epoch": 2.017183883243752, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2884, "step": 6427 }, { "epoch": 2.0174977441249164, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2472, "step": 6428 }, { "epoch": 2.0178116050060813, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.3831, "step": 6429 }, { "epoch": 2.0181254658872456, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2233, "step": 6430 }, { "epoch": 2.01843932676841, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.3126, "step": 6431 }, { "epoch": 2.0187531876495743, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.1973, "step": 6432 }, { "epoch": 2.0190670485307387, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.2813, "step": 6433 }, { "epoch": 2.019380909411903, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.1946, "step": 6434 }, { "epoch": 2.0196947702930674, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.269, "step": 6435 }, { "epoch": 2.0200086311742322, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.2773, "step": 6436 }, { "epoch": 2.0203224920553966, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1583, "step": 6437 }, { "epoch": 2.020636352936561, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.097, "step": 6438 }, { "epoch": 2.0209502138177253, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.1401, "step": 6439 }, { "epoch": 2.0212640746988897, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1959, "step": 6440 }, { "epoch": 2.021577935580054, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.1521, "step": 6441 }, { "epoch": 2.0218917964612184, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.1557, "step": 6442 }, { "epoch": 2.022205657342383, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.1005, "step": 6443 }, { "epoch": 2.0225195182235476, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1871, "step": 6444 }, { "epoch": 2.022833379104712, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0931, "step": 6445 }, { "epoch": 2.0231472399858763, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1432, "step": 6446 }, { "epoch": 2.0234611008670407, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.1383, "step": 6447 }, { "epoch": 2.023774961748205, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.0328, "step": 6448 }, { "epoch": 2.0240888226293694, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.1496, "step": 6449 }, { "epoch": 2.0244026835105338, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1561, "step": 6450 }, { "epoch": 2.0247165443916986, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.3405, "step": 6451 }, { "epoch": 2.025030405272863, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.1838, "step": 6452 }, { "epoch": 2.0253442661540273, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.5417, "step": 6453 }, { "epoch": 2.0256581270351917, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.3732, "step": 6454 }, { "epoch": 2.025971987916356, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.2126, "step": 6455 }, { "epoch": 2.0262858487975204, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.2923, "step": 6456 }, { "epoch": 2.0265997096786847, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.7343, "step": 6457 }, { "epoch": 2.0269135705598496, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.4832, "step": 6458 }, { "epoch": 2.027227431441014, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3652, "step": 6459 }, { "epoch": 2.0275412923221783, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3195, "step": 6460 }, { "epoch": 2.0278551532033426, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.5438, "step": 6461 }, { "epoch": 2.028169014084507, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.885, "step": 6462 }, { "epoch": 2.0284828749656714, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.3795, "step": 6463 }, { "epoch": 2.0287967358468357, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.4408, "step": 6464 }, { "epoch": 2.0291105967280005, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.591, "step": 6465 }, { "epoch": 2.029424457609165, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 2.3045, "step": 6466 }, { "epoch": 2.0297383184903293, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.4852, "step": 6467 }, { "epoch": 2.0300521793714936, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.5878, "step": 6468 }, { "epoch": 2.030366040252658, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.3267, "step": 6469 }, { "epoch": 2.0306799011338224, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.4204, "step": 6470 }, { "epoch": 2.0309937620149867, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.4928, "step": 6471 }, { "epoch": 2.031307622896151, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.0492, "step": 6472 }, { "epoch": 2.031621483777316, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.4144, "step": 6473 }, { "epoch": 2.0319353446584802, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.1082, "step": 6474 }, { "epoch": 2.0322492055396446, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.2631, "step": 6475 }, { "epoch": 2.032563066420809, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.247, "step": 6476 }, { "epoch": 2.0328769273019733, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.3581, "step": 6477 }, { "epoch": 2.0331907881831377, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.215, "step": 6478 }, { "epoch": 2.033504649064302, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.261, "step": 6479 }, { "epoch": 2.033818509945467, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.211, "step": 6480 }, { "epoch": 2.0341323708266312, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.1635, "step": 6481 }, { "epoch": 2.0344462317077956, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.2252, "step": 6482 }, { "epoch": 2.03476009258896, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.1542, "step": 6483 }, { "epoch": 2.0350739534701243, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.1, "step": 6484 }, { "epoch": 2.0353878143512887, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.224, "step": 6485 }, { "epoch": 2.035701675232453, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.205, "step": 6486 }, { "epoch": 2.036015536113618, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.2151, "step": 6487 }, { "epoch": 2.036329396994782, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.247, "step": 6488 }, { "epoch": 2.0366432578759466, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3503, "step": 6489 }, { "epoch": 2.036957118757111, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.2557, "step": 6490 }, { "epoch": 2.0372709796382753, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.0906, "step": 6491 }, { "epoch": 2.0375848405194397, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1621, "step": 6492 }, { "epoch": 2.037898701400604, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1906, "step": 6493 }, { "epoch": 2.0382125622817684, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.1041, "step": 6494 }, { "epoch": 2.038526423162933, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.0958, "step": 6495 }, { "epoch": 2.0388402840440976, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 0.9641, "step": 6496 }, { "epoch": 2.039154144925262, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.2378, "step": 6497 }, { "epoch": 2.0394680058064263, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.9743, "step": 6498 }, { "epoch": 2.0397818666875907, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1066, "step": 6499 }, { "epoch": 2.040095727568755, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.1521, "step": 6500 }, { "epoch": 2.0404095884499194, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.2821, "step": 6501 }, { "epoch": 2.040723449331084, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.0824, "step": 6502 }, { "epoch": 2.0410373102122485, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.3349, "step": 6503 }, { "epoch": 2.041351171093413, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.1959, "step": 6504 }, { "epoch": 2.0416650319745773, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.5071, "step": 6505 }, { "epoch": 2.0419788928557416, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.4881, "step": 6506 }, { "epoch": 2.042292753736906, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.5681, "step": 6507 }, { "epoch": 2.0426066146180704, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.2219, "step": 6508 }, { "epoch": 2.042920475499235, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.6498, "step": 6509 }, { "epoch": 2.0432343363803995, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.3468, "step": 6510 }, { "epoch": 2.043548197261564, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.6375, "step": 6511 }, { "epoch": 2.0438620581427283, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.8956, "step": 6512 }, { "epoch": 2.0441759190238926, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.5777, "step": 6513 }, { "epoch": 2.044489779905057, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 1.9436, "step": 6514 }, { "epoch": 2.0448036407862213, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.5448, "step": 6515 }, { "epoch": 2.045117501667386, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.9716, "step": 6516 }, { "epoch": 2.0454313625485505, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.3166, "step": 6517 }, { "epoch": 2.045745223429715, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.3701, "step": 6518 }, { "epoch": 2.0460590843108792, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.8284, "step": 6519 }, { "epoch": 2.0463729451920436, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.585, "step": 6520 }, { "epoch": 2.046686806073208, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.7104, "step": 6521 }, { "epoch": 2.0470006669543723, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.7063, "step": 6522 }, { "epoch": 2.0473145278355367, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.4015, "step": 6523 }, { "epoch": 2.0476283887167015, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2702, "step": 6524 }, { "epoch": 2.047942249597866, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.3201, "step": 6525 }, { "epoch": 2.0482561104790302, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.3815, "step": 6526 }, { "epoch": 2.0485699713601946, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.2473, "step": 6527 }, { "epoch": 2.048883832241359, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.2444, "step": 6528 }, { "epoch": 2.0491976931225233, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.1019, "step": 6529 }, { "epoch": 2.0495115540036877, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.4394, "step": 6530 }, { "epoch": 2.0498254148848525, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.2125, "step": 6531 }, { "epoch": 2.050139275766017, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.2068, "step": 6532 }, { "epoch": 2.050453136647181, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2661, "step": 6533 }, { "epoch": 2.0507669975283456, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.2278, "step": 6534 }, { "epoch": 2.05108085840951, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.1051, "step": 6535 }, { "epoch": 2.0513947192906743, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.2101, "step": 6536 }, { "epoch": 2.0517085801718387, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.1954, "step": 6537 }, { "epoch": 2.0520224410530035, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.1359, "step": 6538 }, { "epoch": 2.052336301934168, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.3137, "step": 6539 }, { "epoch": 2.052650162815332, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3257, "step": 6540 }, { "epoch": 2.0529640236964966, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.135, "step": 6541 }, { "epoch": 2.053277884577661, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3097, "step": 6542 }, { "epoch": 2.0535917454588253, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1883, "step": 6543 }, { "epoch": 2.0539056063399896, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.0568, "step": 6544 }, { "epoch": 2.054219467221154, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1958, "step": 6545 }, { "epoch": 2.054219467221154, "eval_loss": 1.843532919883728, "eval_runtime": 123.3514, "eval_samples_per_second": 8.107, "eval_steps_per_second": 8.107, "step": 6545 }, { "epoch": 2.054219467221154, "mmlu_eval_accuracy": 0.4295219152661749, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.28, "mmlu_eval_accuracy_nutrition": 0.36363636363636365, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.3235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.4838709677419355, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.3141378964146004, "step": 6545 }, { "epoch": 2.054533328102319, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.4153, "step": 6546 }, { "epoch": 2.054847188983483, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.1459, "step": 6547 }, { "epoch": 2.0551610498646475, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.0317, "step": 6548 }, { "epoch": 2.055474910745812, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.3588, "step": 6549 }, { "epoch": 2.0557887716269763, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.1621, "step": 6550 }, { "epoch": 2.0561026325081406, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.1384, "step": 6551 }, { "epoch": 2.056416493389305, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.1263, "step": 6552 }, { "epoch": 2.05673035427047, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.4628, "step": 6553 }, { "epoch": 2.057044215151634, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.399, "step": 6554 }, { "epoch": 2.0573580760327985, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.04, "step": 6555 }, { "epoch": 2.057671936913963, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.4969, "step": 6556 }, { "epoch": 2.0579857977951272, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.2983, "step": 6557 }, { "epoch": 2.0582996586762916, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.3996, "step": 6558 }, { "epoch": 2.058613519557456, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.6716, "step": 6559 }, { "epoch": 2.058927380438621, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.2771, "step": 6560 }, { "epoch": 2.059241241319785, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.5109, "step": 6561 }, { "epoch": 2.0595551022009495, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5093, "step": 6562 }, { "epoch": 2.059868963082114, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.3985, "step": 6563 }, { "epoch": 2.0601828239632782, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.4276, "step": 6564 }, { "epoch": 2.0604966848444426, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.6134, "step": 6565 }, { "epoch": 2.060810545725607, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.8538, "step": 6566 }, { "epoch": 2.0611244066067718, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 1.8285, "step": 6567 }, { "epoch": 2.061438267487936, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.8899, "step": 6568 }, { "epoch": 2.0617521283691005, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.6118, "step": 6569 }, { "epoch": 2.062065989250265, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 1.4798, "step": 6570 }, { "epoch": 2.062379850131429, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.3975, "step": 6571 }, { "epoch": 2.0626937110125936, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.5737, "step": 6572 }, { "epoch": 2.063007571893758, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.482, "step": 6573 }, { "epoch": 2.0633214327749223, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.2845, "step": 6574 }, { "epoch": 2.063635293656087, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.296, "step": 6575 }, { "epoch": 2.0639491545372515, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.2488, "step": 6576 }, { "epoch": 2.064263015418416, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2408, "step": 6577 }, { "epoch": 2.06457687629958, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.1978, "step": 6578 }, { "epoch": 2.0648907371807446, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3885, "step": 6579 }, { "epoch": 2.065204598061909, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.2603, "step": 6580 }, { "epoch": 2.0655184589430733, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2214, "step": 6581 }, { "epoch": 2.065832319824238, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.1807, "step": 6582 }, { "epoch": 2.0661461807054025, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.1363, "step": 6583 }, { "epoch": 2.066460041586567, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3298, "step": 6584 }, { "epoch": 2.066773902467731, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.1943, "step": 6585 }, { "epoch": 2.0670877633488955, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.1347, "step": 6586 }, { "epoch": 2.06740162423006, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1102, "step": 6587 }, { "epoch": 2.0677154851112243, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.0893, "step": 6588 }, { "epoch": 2.068029345992389, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1005, "step": 6589 }, { "epoch": 2.0683432068735534, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2608, "step": 6590 }, { "epoch": 2.068657067754718, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1202, "step": 6591 }, { "epoch": 2.068970928635882, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2543, "step": 6592 }, { "epoch": 2.0692847895170465, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 0.998, "step": 6593 }, { "epoch": 2.069598650398211, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.0883, "step": 6594 }, { "epoch": 2.0699125112793753, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.1443, "step": 6595 }, { "epoch": 2.0702263721605396, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.0458, "step": 6596 }, { "epoch": 2.0705402330417044, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.285, "step": 6597 }, { "epoch": 2.070854093922869, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.2554, "step": 6598 }, { "epoch": 2.071167954804033, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.3008, "step": 6599 }, { "epoch": 2.0714818156851975, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.1847, "step": 6600 }, { "epoch": 2.071795676566362, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0627, "step": 6601 }, { "epoch": 2.0721095374475262, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.1344, "step": 6602 }, { "epoch": 2.0724233983286906, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.2701, "step": 6603 }, { "epoch": 2.0727372592098554, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.4701, "step": 6604 }, { "epoch": 2.0730511200910198, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.1509, "step": 6605 }, { "epoch": 2.073364980972184, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.4165, "step": 6606 }, { "epoch": 2.0736788418533485, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.3624, "step": 6607 }, { "epoch": 2.073992702734513, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.6507, "step": 6608 }, { "epoch": 2.0743065636156772, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 1.6727, "step": 6609 }, { "epoch": 2.0746204244968416, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.6514, "step": 6610 }, { "epoch": 2.0749342853780064, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.4112, "step": 6611 }, { "epoch": 2.0752481462591708, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.4614, "step": 6612 }, { "epoch": 2.075562007140335, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.3049, "step": 6613 }, { "epoch": 2.0758758680214995, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.5404, "step": 6614 }, { "epoch": 2.076189728902664, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.244, "step": 6615 }, { "epoch": 2.076503589783828, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.6382, "step": 6616 }, { "epoch": 2.0768174506649926, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.4715, "step": 6617 }, { "epoch": 2.0771313115461574, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.3005, "step": 6618 }, { "epoch": 2.0774451724273217, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.6693, "step": 6619 }, { "epoch": 2.077759033308486, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.3043, "step": 6620 }, { "epoch": 2.0780728941896505, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.5473, "step": 6621 }, { "epoch": 2.078386755070815, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.8934, "step": 6622 }, { "epoch": 2.078700615951979, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.5022, "step": 6623 }, { "epoch": 2.0790144768331436, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.1386, "step": 6624 }, { "epoch": 2.0793283377143084, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.226, "step": 6625 }, { "epoch": 2.0796421985954727, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.3001, "step": 6626 }, { "epoch": 2.079956059476637, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.1973, "step": 6627 }, { "epoch": 2.0802699203578014, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3712, "step": 6628 }, { "epoch": 2.080583781238966, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.4416, "step": 6629 }, { "epoch": 2.08089764212013, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.2155, "step": 6630 }, { "epoch": 2.0812115030012945, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.3736, "step": 6631 }, { "epoch": 2.081525363882459, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2758, "step": 6632 }, { "epoch": 2.0818392247636237, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.352, "step": 6633 }, { "epoch": 2.082153085644788, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.1818, "step": 6634 }, { "epoch": 2.0824669465259524, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.1351, "step": 6635 }, { "epoch": 2.082780807407117, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.1143, "step": 6636 }, { "epoch": 2.083094668288281, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.2945, "step": 6637 }, { "epoch": 2.0834085291694455, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.1505, "step": 6638 }, { "epoch": 2.08372239005061, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1879, "step": 6639 }, { "epoch": 2.0840362509317747, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.1508, "step": 6640 }, { "epoch": 2.084350111812939, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.2983, "step": 6641 }, { "epoch": 2.0846639726941034, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.1299, "step": 6642 }, { "epoch": 2.084977833575268, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.2565, "step": 6643 }, { "epoch": 2.085291694456432, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.112, "step": 6644 }, { "epoch": 2.0856055553375965, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3483, "step": 6645 }, { "epoch": 2.085919416218761, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.1211, "step": 6646 }, { "epoch": 2.0862332770999252, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.0277, "step": 6647 }, { "epoch": 2.08654713798109, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2961, "step": 6648 }, { "epoch": 2.0868609988622544, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.2166, "step": 6649 }, { "epoch": 2.0871748597434188, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.1088, "step": 6650 }, { "epoch": 2.087488720624583, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.2482, "step": 6651 }, { "epoch": 2.0878025815057475, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.2477, "step": 6652 }, { "epoch": 2.088116442386912, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.3587, "step": 6653 }, { "epoch": 2.088430303268076, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4881, "step": 6654 }, { "epoch": 2.088744164149241, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.436, "step": 6655 }, { "epoch": 2.0890580250304054, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.517, "step": 6656 }, { "epoch": 2.0893718859115697, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.6483, "step": 6657 }, { "epoch": 2.089685746792734, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.2482, "step": 6658 }, { "epoch": 2.0899996076738985, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3802, "step": 6659 }, { "epoch": 2.090313468555063, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.4399, "step": 6660 }, { "epoch": 2.090627329436227, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.6742, "step": 6661 }, { "epoch": 2.090941190317392, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.3579, "step": 6662 }, { "epoch": 2.0912550511985564, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.8549, "step": 6663 }, { "epoch": 2.0915689120797207, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.7403, "step": 6664 }, { "epoch": 2.091882772960885, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 1.4915, "step": 6665 }, { "epoch": 2.0921966338420495, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.6108, "step": 6666 }, { "epoch": 2.092510494723214, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.7772, "step": 6667 }, { "epoch": 2.092824355604378, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.3944, "step": 6668 }, { "epoch": 2.093138216485543, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.5062, "step": 6669 }, { "epoch": 2.0934520773667074, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.2579, "step": 6670 }, { "epoch": 2.0937659382478717, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 1.3374, "step": 6671 }, { "epoch": 2.094079799129036, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 2.0661, "step": 6672 }, { "epoch": 2.0943936600102004, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3999, "step": 6673 }, { "epoch": 2.094707520891365, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.232, "step": 6674 }, { "epoch": 2.095021381772529, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3604, "step": 6675 }, { "epoch": 2.095335242653694, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.2852, "step": 6676 }, { "epoch": 2.0956491035348583, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.1618, "step": 6677 }, { "epoch": 2.0959629644160227, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.1367, "step": 6678 }, { "epoch": 2.096276825297187, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2307, "step": 6679 }, { "epoch": 2.0965906861783514, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.3044, "step": 6680 }, { "epoch": 2.096904547059516, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.1697, "step": 6681 }, { "epoch": 2.09721840794068, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1658, "step": 6682 }, { "epoch": 2.0975322688218445, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.3785, "step": 6683 }, { "epoch": 2.0978461297030093, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3131, "step": 6684 }, { "epoch": 2.0981599905841737, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.1546, "step": 6685 }, { "epoch": 2.098473851465338, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.1966, "step": 6686 }, { "epoch": 2.0987877123465024, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.0451, "step": 6687 }, { "epoch": 2.0991015732276668, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.2748, "step": 6688 }, { "epoch": 2.099415434108831, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.122, "step": 6689 }, { "epoch": 2.0997292949899955, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.0628, "step": 6690 }, { "epoch": 2.1000431558711603, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.091, "step": 6691 }, { "epoch": 2.1003570167523247, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.17, "step": 6692 }, { "epoch": 2.100670877633489, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2284, "step": 6693 }, { "epoch": 2.1009847385146534, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.2128, "step": 6694 }, { "epoch": 2.1012985993958178, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 0.9812, "step": 6695 }, { "epoch": 2.101612460276982, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.0904, "step": 6696 }, { "epoch": 2.1019263211581465, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1386, "step": 6697 }, { "epoch": 2.1022401820393113, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2994, "step": 6698 }, { "epoch": 2.1025540429204757, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.1322, "step": 6699 }, { "epoch": 2.10286790380164, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.1264, "step": 6700 }, { "epoch": 2.1031817646828044, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.6199, "step": 6701 }, { "epoch": 2.1034956255639687, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.6834, "step": 6702 }, { "epoch": 2.103809486445133, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.283, "step": 6703 }, { "epoch": 2.1041233473262975, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.4683, "step": 6704 }, { "epoch": 2.104437208207462, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4372, "step": 6705 }, { "epoch": 2.1047510690886266, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.4158, "step": 6706 }, { "epoch": 2.105064929969791, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.4875, "step": 6707 }, { "epoch": 2.1053787908509554, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.571, "step": 6708 }, { "epoch": 2.1056926517321197, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.564, "step": 6709 }, { "epoch": 2.106006512613284, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.6821, "step": 6710 }, { "epoch": 2.1063203734944485, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.6252, "step": 6711 }, { "epoch": 2.106634234375613, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.2611, "step": 6712 }, { "epoch": 2.1069480952567776, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.285, "step": 6713 }, { "epoch": 2.107261956137942, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 1.5855, "step": 6714 }, { "epoch": 2.1075758170191063, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.8855, "step": 6715 }, { "epoch": 2.1078896779002707, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.5941, "step": 6716 }, { "epoch": 2.108203538781435, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.6059, "step": 6717 }, { "epoch": 2.1085173996625994, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.7762, "step": 6718 }, { "epoch": 2.108831260543764, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3035, "step": 6719 }, { "epoch": 2.1091451214249286, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.5713, "step": 6720 }, { "epoch": 2.109458982306093, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.6413, "step": 6721 }, { "epoch": 2.1097728431872573, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 2.1769, "step": 6722 }, { "epoch": 2.1100867040684217, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.7023, "step": 6723 }, { "epoch": 2.110400564949586, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.2982, "step": 6724 }, { "epoch": 2.1107144258307504, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.1938, "step": 6725 }, { "epoch": 2.111028286711915, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.4027, "step": 6726 }, { "epoch": 2.1113421475930796, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.1081, "step": 6727 }, { "epoch": 2.111656008474244, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.1019, "step": 6728 }, { "epoch": 2.1119698693554083, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.3549, "step": 6729 }, { "epoch": 2.1122837302365727, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.2879, "step": 6730 }, { "epoch": 2.112597591117737, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.2641, "step": 6731 }, { "epoch": 2.1129114519989014, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.1367, "step": 6732 }, { "epoch": 2.1129114519989014, "eval_loss": 1.8845981359481812, "eval_runtime": 123.4066, "eval_samples_per_second": 8.103, "eval_steps_per_second": 8.103, "step": 6732 }, { "epoch": 2.1129114519989014, "mmlu_eval_accuracy": 0.4160017211600709, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.46153846153846156, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.65, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.3058823529411765, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.1772874047813036, "step": 6732 }, { "epoch": 2.1132253128800658, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.15, "step": 6733 }, { "epoch": 2.11353917376123, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.1879, "step": 6734 }, { "epoch": 2.113853034642395, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.2414, "step": 6735 }, { "epoch": 2.1141668955235593, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.1459, "step": 6736 }, { "epoch": 2.1144807564047237, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.0652, "step": 6737 }, { "epoch": 2.114794617285888, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.3751, "step": 6738 }, { "epoch": 2.1151084781670524, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.1655, "step": 6739 }, { "epoch": 2.1154223390482167, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.1875, "step": 6740 }, { "epoch": 2.115736199929381, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.1378, "step": 6741 }, { "epoch": 2.116050060810546, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.0022, "step": 6742 }, { "epoch": 2.1163639216917103, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1581, "step": 6743 }, { "epoch": 2.1166777825728746, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.117, "step": 6744 }, { "epoch": 2.116991643454039, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2402, "step": 6745 }, { "epoch": 2.1173055043352034, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.2288, "step": 6746 }, { "epoch": 2.1176193652163677, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.056, "step": 6747 }, { "epoch": 2.117933226097532, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.3126, "step": 6748 }, { "epoch": 2.118247086978697, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.049, "step": 6749 }, { "epoch": 2.1185609478598613, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1198, "step": 6750 }, { "epoch": 2.1188748087410256, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.2, "step": 6751 }, { "epoch": 2.11918866962219, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.5001, "step": 6752 }, { "epoch": 2.1195025305033544, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.1871, "step": 6753 }, { "epoch": 2.1198163913845187, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.4034, "step": 6754 }, { "epoch": 2.120130252265683, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.3546, "step": 6755 }, { "epoch": 2.1204441131468474, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.7463, "step": 6756 }, { "epoch": 2.1207579740280122, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.5346, "step": 6757 }, { "epoch": 2.1210718349091766, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.2616, "step": 6758 }, { "epoch": 2.121385695790341, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.5472, "step": 6759 }, { "epoch": 2.1216995566715053, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.5406, "step": 6760 }, { "epoch": 2.1220134175526697, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3958, "step": 6761 }, { "epoch": 2.122327278433834, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.6495, "step": 6762 }, { "epoch": 2.1226411393149984, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 1.7449, "step": 6763 }, { "epoch": 2.1229550001961632, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.8357, "step": 6764 }, { "epoch": 2.1232688610773276, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.804, "step": 6765 }, { "epoch": 2.123582721958492, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.5576, "step": 6766 }, { "epoch": 2.1238965828396563, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.4532, "step": 6767 }, { "epoch": 2.1242104437208207, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.497, "step": 6768 }, { "epoch": 2.124524304601985, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3321, "step": 6769 }, { "epoch": 2.1248381654831494, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.3592, "step": 6770 }, { "epoch": 2.125152026364314, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 1.8384, "step": 6771 }, { "epoch": 2.1254658872454786, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.0317, "step": 6772 }, { "epoch": 2.125779748126643, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.875, "step": 6773 }, { "epoch": 2.1260936090078073, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.2051, "step": 6774 }, { "epoch": 2.1264074698889717, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3364, "step": 6775 }, { "epoch": 2.126721330770136, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.292, "step": 6776 }, { "epoch": 2.1270351916513004, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.1784, "step": 6777 }, { "epoch": 2.127349052532465, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.1701, "step": 6778 }, { "epoch": 2.1276629134136296, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2025, "step": 6779 }, { "epoch": 2.127976774294794, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.2073, "step": 6780 }, { "epoch": 2.1282906351759583, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.1526, "step": 6781 }, { "epoch": 2.1286044960571227, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2911, "step": 6782 }, { "epoch": 2.128918356938287, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.2722, "step": 6783 }, { "epoch": 2.1292322178194514, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.0901, "step": 6784 }, { "epoch": 2.129546078700616, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.1132, "step": 6785 }, { "epoch": 2.1298599395817805, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.1682, "step": 6786 }, { "epoch": 2.130173800462945, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2096, "step": 6787 }, { "epoch": 2.1304876613441093, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.3941, "step": 6788 }, { "epoch": 2.1308015222252736, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.0099, "step": 6789 }, { "epoch": 2.131115383106438, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.2074, "step": 6790 }, { "epoch": 2.1314292439876024, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.114, "step": 6791 }, { "epoch": 2.1317431048687667, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.1693, "step": 6792 }, { "epoch": 2.1320569657499315, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1457, "step": 6793 }, { "epoch": 2.132370826631096, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.1432, "step": 6794 }, { "epoch": 2.1326846875122603, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.1823, "step": 6795 }, { "epoch": 2.1329985483934246, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.2051, "step": 6796 }, { "epoch": 2.133312409274589, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.1095, "step": 6797 }, { "epoch": 2.1336262701557533, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.0579, "step": 6798 }, { "epoch": 2.1339401310369177, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.4059, "step": 6799 }, { "epoch": 2.1342539919180825, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.3724, "step": 6800 }, { "epoch": 2.134567852799247, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.3845, "step": 6801 }, { "epoch": 2.1348817136804112, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.3795, "step": 6802 }, { "epoch": 2.1351955745615756, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.3792, "step": 6803 }, { "epoch": 2.13550943544274, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.271, "step": 6804 }, { "epoch": 2.1358232963239043, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.3843, "step": 6805 }, { "epoch": 2.1361371572050687, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.2448, "step": 6806 }, { "epoch": 2.136451018086233, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.5088, "step": 6807 }, { "epoch": 2.136764878967398, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.6733, "step": 6808 }, { "epoch": 2.1370787398485622, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.533, "step": 6809 }, { "epoch": 2.1373926007297266, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.6491, "step": 6810 }, { "epoch": 2.137706461610891, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.5106, "step": 6811 }, { "epoch": 2.1380203224920553, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.5473, "step": 6812 }, { "epoch": 2.1383341833732197, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.2842, "step": 6813 }, { "epoch": 2.138648044254384, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.8684, "step": 6814 }, { "epoch": 2.138961905135549, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5224, "step": 6815 }, { "epoch": 2.139275766016713, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.7703, "step": 6816 }, { "epoch": 2.1395896268978776, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.9016, "step": 6817 }, { "epoch": 2.139903487779042, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.7186, "step": 6818 }, { "epoch": 2.1402173486602063, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.9751, "step": 6819 }, { "epoch": 2.1405312095413707, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.6175, "step": 6820 }, { "epoch": 2.140845070422535, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.618, "step": 6821 }, { "epoch": 2.1411589313037, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 2.2178, "step": 6822 }, { "epoch": 2.141472792184864, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.6464, "step": 6823 }, { "epoch": 2.1417866530660286, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2822, "step": 6824 }, { "epoch": 2.142100513947193, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.1868, "step": 6825 }, { "epoch": 2.1424143748283573, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.1706, "step": 6826 }, { "epoch": 2.1427282357095216, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.2724, "step": 6827 }, { "epoch": 2.143042096590686, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.2537, "step": 6828 }, { "epoch": 2.143355957471851, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.5385, "step": 6829 }, { "epoch": 2.143669818353015, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.3214, "step": 6830 }, { "epoch": 2.1439836792341795, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.296, "step": 6831 }, { "epoch": 2.144297540115344, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.278, "step": 6832 }, { "epoch": 2.1446114009965083, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.2616, "step": 6833 }, { "epoch": 2.1449252618776726, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.225, "step": 6834 }, { "epoch": 2.145239122758837, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.1737, "step": 6835 }, { "epoch": 2.145552983640002, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2207, "step": 6836 }, { "epoch": 2.145866844521166, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.1458, "step": 6837 }, { "epoch": 2.1461807054023305, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2683, "step": 6838 }, { "epoch": 2.146494566283495, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1668, "step": 6839 }, { "epoch": 2.1468084271646592, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2434, "step": 6840 }, { "epoch": 2.1471222880458236, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1644, "step": 6841 }, { "epoch": 2.147436148926988, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.3738, "step": 6842 }, { "epoch": 2.1477500098081523, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2088, "step": 6843 }, { "epoch": 2.148063870689317, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3195, "step": 6844 }, { "epoch": 2.1483777315704815, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2277, "step": 6845 }, { "epoch": 2.148691592451646, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1324, "step": 6846 }, { "epoch": 2.1490054533328102, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.149, "step": 6847 }, { "epoch": 2.1493193142139746, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.3268, "step": 6848 }, { "epoch": 2.149633175095139, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.1116, "step": 6849 }, { "epoch": 2.1499470359763033, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.118, "step": 6850 }, { "epoch": 2.150260896857468, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.6066, "step": 6851 }, { "epoch": 2.1505747577386325, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.2709, "step": 6852 }, { "epoch": 2.150888618619797, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.3302, "step": 6853 }, { "epoch": 2.151202479500961, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.3063, "step": 6854 }, { "epoch": 2.1515163403821256, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 1.6667, "step": 6855 }, { "epoch": 2.15183020126329, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.4132, "step": 6856 }, { "epoch": 2.1521440621444543, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.4771, "step": 6857 }, { "epoch": 2.1524579230256187, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.428, "step": 6858 }, { "epoch": 2.1527717839067835, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.6305, "step": 6859 }, { "epoch": 2.153085644787948, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.5886, "step": 6860 }, { "epoch": 2.153399505669112, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5487, "step": 6861 }, { "epoch": 2.1537133665502766, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 2.0663, "step": 6862 }, { "epoch": 2.154027227431441, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.6055, "step": 6863 }, { "epoch": 2.1543410883126053, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.6957, "step": 6864 }, { "epoch": 2.1546549491937697, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.2985, "step": 6865 }, { "epoch": 2.1549688100749345, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 2.0634, "step": 6866 }, { "epoch": 2.155282670956099, "grad_norm": 1.4140625, "learning_rate": 0.0002, "loss": 1.799, "step": 6867 }, { "epoch": 2.155596531837263, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.7505, "step": 6868 }, { "epoch": 2.1559103927184275, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.3157, "step": 6869 }, { "epoch": 2.156224253599592, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.5125, "step": 6870 }, { "epoch": 2.1565381144807563, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.4366, "step": 6871 }, { "epoch": 2.1568519753619206, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 2.0376, "step": 6872 }, { "epoch": 2.1571658362430854, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.9673, "step": 6873 }, { "epoch": 2.15747969712425, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.1919, "step": 6874 }, { "epoch": 2.157793558005414, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3039, "step": 6875 }, { "epoch": 2.1581074188865785, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2509, "step": 6876 }, { "epoch": 2.158421279767743, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2111, "step": 6877 }, { "epoch": 2.1587351406489073, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.3505, "step": 6878 }, { "epoch": 2.1590490015300716, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.1955, "step": 6879 }, { "epoch": 2.1593628624112364, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3208, "step": 6880 }, { "epoch": 2.159676723292401, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.177, "step": 6881 }, { "epoch": 2.159990584173565, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.1524, "step": 6882 }, { "epoch": 2.1603044450547295, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.1777, "step": 6883 }, { "epoch": 2.160618305935894, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2514, "step": 6884 }, { "epoch": 2.1609321668170582, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2055, "step": 6885 }, { "epoch": 2.1612460276982226, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1553, "step": 6886 }, { "epoch": 2.1615598885793874, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2612, "step": 6887 }, { "epoch": 2.1618737494605518, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1441, "step": 6888 }, { "epoch": 2.162187610341716, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1753, "step": 6889 }, { "epoch": 2.1625014712228805, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1841, "step": 6890 }, { "epoch": 2.162815332104045, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.0857, "step": 6891 }, { "epoch": 2.1631291929852092, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.3838, "step": 6892 }, { "epoch": 2.1634430538663736, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.4182, "step": 6893 }, { "epoch": 2.163756914747538, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.3201, "step": 6894 }, { "epoch": 2.1640707756287028, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.9924, "step": 6895 }, { "epoch": 2.164384636509867, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.1636, "step": 6896 }, { "epoch": 2.1646984973910315, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.1477, "step": 6897 }, { "epoch": 2.165012358272196, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.147, "step": 6898 }, { "epoch": 2.16532621915336, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.2233, "step": 6899 }, { "epoch": 2.1656400800345246, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.4101, "step": 6900 }, { "epoch": 2.165953940915689, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.0818, "step": 6901 }, { "epoch": 2.1662678017968537, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.5216, "step": 6902 }, { "epoch": 2.166581662678018, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.611, "step": 6903 }, { "epoch": 2.1668955235591825, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5473, "step": 6904 }, { "epoch": 2.167209384440347, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4276, "step": 6905 }, { "epoch": 2.167523245321511, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.6521, "step": 6906 }, { "epoch": 2.1678371062026756, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.5468, "step": 6907 }, { "epoch": 2.16815096708384, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.5741, "step": 6908 }, { "epoch": 2.1684648279650043, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5127, "step": 6909 }, { "epoch": 2.168778688846169, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 1.6309, "step": 6910 }, { "epoch": 2.1690925497273335, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.3051, "step": 6911 }, { "epoch": 2.169406410608498, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.7332, "step": 6912 }, { "epoch": 2.169720271489662, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 1.7025, "step": 6913 }, { "epoch": 2.1700341323708265, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.4458, "step": 6914 }, { "epoch": 2.170347993251991, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.2982, "step": 6915 }, { "epoch": 2.1706618541331553, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.9314, "step": 6916 }, { "epoch": 2.17097571501432, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.8198, "step": 6917 }, { "epoch": 2.1712895758954844, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.6126, "step": 6918 }, { "epoch": 2.171603436776649, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4458, "step": 6919 }, { "epoch": 2.171603436776649, "eval_loss": 1.9108567237854004, "eval_runtime": 123.0755, "eval_samples_per_second": 8.125, "eval_steps_per_second": 8.125, "step": 6919 }, { "epoch": 2.171603436776649, "mmlu_eval_accuracy": 0.41087949761996856, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4418604651162791, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.5833333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.6363636363636364, "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5, "mmlu_eval_accuracy_moral_disputes": 0.5, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.5161290322580645, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2637697486590125, "step": 6919 }, { "epoch": 2.171917297657813, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.2948, "step": 6920 }, { "epoch": 2.1722311585389775, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.8296, "step": 6921 }, { "epoch": 2.172545019420142, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.8349, "step": 6922 }, { "epoch": 2.1728588803013063, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.6083, "step": 6923 }, { "epoch": 2.173172741182471, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4099, "step": 6924 }, { "epoch": 2.1734866020636354, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.159, "step": 6925 }, { "epoch": 2.1738004629448, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.399, "step": 6926 }, { "epoch": 2.174114323825964, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3964, "step": 6927 }, { "epoch": 2.1744281847071285, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.1958, "step": 6928 }, { "epoch": 2.174742045588293, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.3266, "step": 6929 }, { "epoch": 2.1750559064694572, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.3394, "step": 6930 }, { "epoch": 2.175369767350622, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2084, "step": 6931 }, { "epoch": 2.1756836282317864, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.4368, "step": 6932 }, { "epoch": 2.1759974891129508, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1479, "step": 6933 }, { "epoch": 2.176311349994115, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.2597, "step": 6934 }, { "epoch": 2.1766252108752795, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.0972, "step": 6935 }, { "epoch": 2.176939071756444, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3139, "step": 6936 }, { "epoch": 2.177252932637608, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2088, "step": 6937 }, { "epoch": 2.177566793518773, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.2534, "step": 6938 }, { "epoch": 2.1778806543999374, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2668, "step": 6939 }, { "epoch": 2.1781945152811018, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.3018, "step": 6940 }, { "epoch": 2.178508376162266, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.2134, "step": 6941 }, { "epoch": 2.1788222370434305, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.1556, "step": 6942 }, { "epoch": 2.179136097924595, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.1412, "step": 6943 }, { "epoch": 2.179449958805759, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.0205, "step": 6944 }, { "epoch": 2.1797638196869236, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1013, "step": 6945 }, { "epoch": 2.1800776805680884, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.1996, "step": 6946 }, { "epoch": 2.1803915414492527, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.1859, "step": 6947 }, { "epoch": 2.180705402330417, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.1421, "step": 6948 }, { "epoch": 2.1810192632115815, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.2526, "step": 6949 }, { "epoch": 2.181333124092746, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.3302, "step": 6950 }, { "epoch": 2.18164698497391, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.5186, "step": 6951 }, { "epoch": 2.1819608458550745, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.2947, "step": 6952 }, { "epoch": 2.1822747067362394, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.526, "step": 6953 }, { "epoch": 2.1825885676174037, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.2987, "step": 6954 }, { "epoch": 2.182902428498568, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.2986, "step": 6955 }, { "epoch": 2.1832162893797324, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.4097, "step": 6956 }, { "epoch": 2.183530150260897, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.6956, "step": 6957 }, { "epoch": 2.183844011142061, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 1.4406, "step": 6958 }, { "epoch": 2.1841578720232255, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5417, "step": 6959 }, { "epoch": 2.18447173290439, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5466, "step": 6960 }, { "epoch": 2.1847855937855547, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.4767, "step": 6961 }, { "epoch": 2.185099454666719, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 1.9594, "step": 6962 }, { "epoch": 2.1854133155478834, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 1.803, "step": 6963 }, { "epoch": 2.185727176429048, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 2.0183, "step": 6964 }, { "epoch": 2.186041037310212, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.8014, "step": 6965 }, { "epoch": 2.1863548981913765, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.3876, "step": 6966 }, { "epoch": 2.186668759072541, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 2.0449, "step": 6967 }, { "epoch": 2.1869826199537057, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.6801, "step": 6968 }, { "epoch": 2.18729648083487, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.2796, "step": 6969 }, { "epoch": 2.1876103417160344, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.5317, "step": 6970 }, { "epoch": 2.1879242025971988, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.6394, "step": 6971 }, { "epoch": 2.188238063478363, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.4911, "step": 6972 }, { "epoch": 2.1885519243595275, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.6311, "step": 6973 }, { "epoch": 2.188865785240692, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.2183, "step": 6974 }, { "epoch": 2.1891796461218567, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3587, "step": 6975 }, { "epoch": 2.189493507003021, "grad_norm": 0.205078125, "learning_rate": 0.0002, "loss": 1.4424, "step": 6976 }, { "epoch": 2.1898073678841854, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.1984, "step": 6977 }, { "epoch": 2.1901212287653498, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3143, "step": 6978 }, { "epoch": 2.190435089646514, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.3413, "step": 6979 }, { "epoch": 2.1907489505276785, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2222, "step": 6980 }, { "epoch": 2.191062811408843, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.1879, "step": 6981 }, { "epoch": 2.1913766722900077, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.1602, "step": 6982 }, { "epoch": 2.191690533171172, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.3725, "step": 6983 }, { "epoch": 2.1920043940523364, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.2544, "step": 6984 }, { "epoch": 2.1923182549335007, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.2809, "step": 6985 }, { "epoch": 2.192632115814665, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.0753, "step": 6986 }, { "epoch": 2.1929459766958295, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1883, "step": 6987 }, { "epoch": 2.193259837576994, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1687, "step": 6988 }, { "epoch": 2.1935736984581586, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2067, "step": 6989 }, { "epoch": 2.193887559339323, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.0681, "step": 6990 }, { "epoch": 2.1942014202204874, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0512, "step": 6991 }, { "epoch": 2.1945152811016517, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2436, "step": 6992 }, { "epoch": 2.194829141982816, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.0949, "step": 6993 }, { "epoch": 2.1951430028639805, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1986, "step": 6994 }, { "epoch": 2.195456863745145, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.2812, "step": 6995 }, { "epoch": 2.1957707246263096, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.2332, "step": 6996 }, { "epoch": 2.196084585507474, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.0782, "step": 6997 }, { "epoch": 2.1963984463886383, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.2484, "step": 6998 }, { "epoch": 2.1967123072698027, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.347, "step": 6999 }, { "epoch": 2.197026168150967, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.2324, "step": 7000 }, { "epoch": 2.1973400290321314, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.2807, "step": 7001 }, { "epoch": 2.197653889913296, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.6561, "step": 7002 }, { "epoch": 2.19796775079446, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.2975, "step": 7003 }, { "epoch": 2.198281611675625, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.3722, "step": 7004 }, { "epoch": 2.1985954725567893, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.5477, "step": 7005 }, { "epoch": 2.1989093334379537, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.5554, "step": 7006 }, { "epoch": 2.199223194319118, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.5507, "step": 7007 }, { "epoch": 2.1995370552002824, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.4281, "step": 7008 }, { "epoch": 2.199850916081447, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.5234, "step": 7009 }, { "epoch": 2.200164776962611, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 1.9167, "step": 7010 }, { "epoch": 2.200478637843776, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 1.6815, "step": 7011 }, { "epoch": 2.2007924987249403, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.6578, "step": 7012 }, { "epoch": 2.2011063596061047, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 1.7545, "step": 7013 }, { "epoch": 2.201420220487269, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.8026, "step": 7014 }, { "epoch": 2.2017340813684334, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4013, "step": 7015 }, { "epoch": 2.2020479422495978, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.7612, "step": 7016 }, { "epoch": 2.202361803130762, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.5383, "step": 7017 }, { "epoch": 2.2026756640119265, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.3195, "step": 7018 }, { "epoch": 2.2029895248930913, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.4813, "step": 7019 }, { "epoch": 2.2033033857742557, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.6449, "step": 7020 }, { "epoch": 2.20361724665542, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.4753, "step": 7021 }, { "epoch": 2.2039311075365844, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 1.9823, "step": 7022 }, { "epoch": 2.2042449684177488, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.7635, "step": 7023 }, { "epoch": 2.204558829298913, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.3181, "step": 7024 }, { "epoch": 2.2048726901800775, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.1876, "step": 7025 }, { "epoch": 2.2051865510612423, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.2938, "step": 7026 }, { "epoch": 2.2055004119424066, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.265, "step": 7027 }, { "epoch": 2.205814272823571, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.21, "step": 7028 }, { "epoch": 2.2061281337047354, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.3629, "step": 7029 }, { "epoch": 2.2064419945858997, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.2804, "step": 7030 }, { "epoch": 2.206755855467064, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3064, "step": 7031 }, { "epoch": 2.2070697163482285, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2229, "step": 7032 }, { "epoch": 2.2073835772293933, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.1505, "step": 7033 }, { "epoch": 2.2076974381105576, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2146, "step": 7034 }, { "epoch": 2.208011298991722, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1732, "step": 7035 }, { "epoch": 2.2083251598728864, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.1353, "step": 7036 }, { "epoch": 2.2086390207540507, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1966, "step": 7037 }, { "epoch": 2.208952881635215, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.2293, "step": 7038 }, { "epoch": 2.2092667425163794, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.1095, "step": 7039 }, { "epoch": 2.2095806033975443, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.0926, "step": 7040 }, { "epoch": 2.2098944642787086, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.0617, "step": 7041 }, { "epoch": 2.210208325159873, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1509, "step": 7042 }, { "epoch": 2.2105221860410373, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.1196, "step": 7043 }, { "epoch": 2.2108360469222017, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.222, "step": 7044 }, { "epoch": 2.211149907803366, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.0556, "step": 7045 }, { "epoch": 2.2114637686845304, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.1312, "step": 7046 }, { "epoch": 2.2117776295656952, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.1114, "step": 7047 }, { "epoch": 2.2120914904468596, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.0889, "step": 7048 }, { "epoch": 2.212405351328024, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.066, "step": 7049 }, { "epoch": 2.2127192122091883, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.3159, "step": 7050 }, { "epoch": 2.2130330730903527, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.2005, "step": 7051 }, { "epoch": 2.213346933971517, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.4483, "step": 7052 }, { "epoch": 2.2136607948526814, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.5394, "step": 7053 }, { "epoch": 2.2139746557338458, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.2109, "step": 7054 }, { "epoch": 2.2142885166150106, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 1.902, "step": 7055 }, { "epoch": 2.214602377496175, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.7741, "step": 7056 }, { "epoch": 2.2149162383773393, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.4993, "step": 7057 }, { "epoch": 2.2152300992585037, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4955, "step": 7058 }, { "epoch": 2.215543960139668, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.4148, "step": 7059 }, { "epoch": 2.2158578210208324, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 1.8686, "step": 7060 }, { "epoch": 2.2161716819019968, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 1.7474, "step": 7061 }, { "epoch": 2.2164855427831616, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.6311, "step": 7062 }, { "epoch": 2.216799403664326, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.5525, "step": 7063 }, { "epoch": 2.2171132645454903, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.8235, "step": 7064 }, { "epoch": 2.2174271254266547, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.6988, "step": 7065 }, { "epoch": 2.217740986307819, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 2.1018, "step": 7066 }, { "epoch": 2.2180548471889834, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 1.8875, "step": 7067 }, { "epoch": 2.2183687080701477, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 2.0372, "step": 7068 }, { "epoch": 2.218682568951312, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.2988, "step": 7069 }, { "epoch": 2.218996429832477, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.523, "step": 7070 }, { "epoch": 2.2193102907136413, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7166, "step": 7071 }, { "epoch": 2.2196241515948056, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.7636, "step": 7072 }, { "epoch": 2.21993801247597, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.4759, "step": 7073 }, { "epoch": 2.2202518733571344, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.2731, "step": 7074 }, { "epoch": 2.2205657342382987, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.1947, "step": 7075 }, { "epoch": 2.220879595119463, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.2262, "step": 7076 }, { "epoch": 2.221193456000628, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.4372, "step": 7077 }, { "epoch": 2.2215073168817923, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.2085, "step": 7078 }, { "epoch": 2.2218211777629566, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.2133, "step": 7079 }, { "epoch": 2.222135038644121, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.256, "step": 7080 }, { "epoch": 2.2224488995252853, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.1858, "step": 7081 }, { "epoch": 2.2227627604064497, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.2447, "step": 7082 }, { "epoch": 2.223076621287614, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1893, "step": 7083 }, { "epoch": 2.223390482168779, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.3268, "step": 7084 }, { "epoch": 2.2237043430499432, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.1757, "step": 7085 }, { "epoch": 2.2240182039311076, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1648, "step": 7086 }, { "epoch": 2.224332064812272, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3042, "step": 7087 }, { "epoch": 2.2246459256934363, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.261, "step": 7088 }, { "epoch": 2.2249597865746007, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2222, "step": 7089 }, { "epoch": 2.225273647455765, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.1377, "step": 7090 }, { "epoch": 2.22558750833693, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.1968, "step": 7091 }, { "epoch": 2.2259013692180942, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.0236, "step": 7092 }, { "epoch": 2.2262152300992586, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0986, "step": 7093 }, { "epoch": 2.226529090980423, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.205, "step": 7094 }, { "epoch": 2.2268429518615873, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.2683, "step": 7095 }, { "epoch": 2.2271568127427517, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.2679, "step": 7096 }, { "epoch": 2.227470673623916, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.156, "step": 7097 }, { "epoch": 2.227784534505081, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.1063, "step": 7098 }, { "epoch": 2.228098395386245, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.1573, "step": 7099 }, { "epoch": 2.2284122562674096, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.1298, "step": 7100 }, { "epoch": 2.228726117148574, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.3207, "step": 7101 }, { "epoch": 2.2290399780297383, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.22, "step": 7102 }, { "epoch": 2.2293538389109027, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.2568, "step": 7103 }, { "epoch": 2.229667699792067, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.303, "step": 7104 }, { "epoch": 2.2299815606732314, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.5303, "step": 7105 }, { "epoch": 2.230295421554396, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.5333, "step": 7106 }, { "epoch": 2.230295421554396, "eval_loss": 1.8770476579666138, "eval_runtime": 123.3685, "eval_samples_per_second": 8.106, "eval_steps_per_second": 8.106, "step": 7106 }, { "epoch": 2.230295421554396, "mmlu_eval_accuracy": 0.4073725220978079, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.5, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.09090909090909091, "mmlu_eval_accuracy_conceptual_physics": 0.5, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.42857142857142855, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.46511627906976744, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.5333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.76, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.36363636363636365, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.45161290322580644, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.329874158294897, "step": 7106 }, { "epoch": 2.2306092824355606, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.5482, "step": 7107 }, { "epoch": 2.230923143316725, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 1.9437, "step": 7108 }, { "epoch": 2.2312370041978893, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.5668, "step": 7109 }, { "epoch": 2.2315508650790536, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.4053, "step": 7110 }, { "epoch": 2.231864725960218, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.5723, "step": 7111 }, { "epoch": 2.2321785868413824, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.7933, "step": 7112 }, { "epoch": 2.232492447722547, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.4577, "step": 7113 }, { "epoch": 2.2328063086037115, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.2109, "step": 7114 }, { "epoch": 2.233120169484876, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.6556, "step": 7115 }, { "epoch": 2.2334340303660403, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.8344, "step": 7116 }, { "epoch": 2.2337478912472046, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.5766, "step": 7117 }, { "epoch": 2.234061752128369, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.3747, "step": 7118 }, { "epoch": 2.2343756130095334, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.3569, "step": 7119 }, { "epoch": 2.2346894738906977, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.3268, "step": 7120 }, { "epoch": 2.2350033347718625, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.7115, "step": 7121 }, { "epoch": 2.235317195653027, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 2.0211, "step": 7122 }, { "epoch": 2.2356310565341913, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.5885, "step": 7123 }, { "epoch": 2.2359449174153556, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.2717, "step": 7124 }, { "epoch": 2.23625877829652, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3483, "step": 7125 }, { "epoch": 2.2365726391776843, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.385, "step": 7126 }, { "epoch": 2.2368865000588487, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4571, "step": 7127 }, { "epoch": 2.2372003609400135, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3417, "step": 7128 }, { "epoch": 2.237514221821178, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.249, "step": 7129 }, { "epoch": 2.2378280827023422, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2537, "step": 7130 }, { "epoch": 2.2381419435835066, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2247, "step": 7131 }, { "epoch": 2.238455804464671, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.449, "step": 7132 }, { "epoch": 2.2387696653458353, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.3207, "step": 7133 }, { "epoch": 2.2390835262269997, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.2411, "step": 7134 }, { "epoch": 2.2393973871081645, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1709, "step": 7135 }, { "epoch": 2.239711247989329, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1944, "step": 7136 }, { "epoch": 2.240025108870493, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.42, "step": 7137 }, { "epoch": 2.2403389697516576, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1596, "step": 7138 }, { "epoch": 2.240652830632822, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.3256, "step": 7139 }, { "epoch": 2.2409666915139863, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.066, "step": 7140 }, { "epoch": 2.2412805523951507, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.314, "step": 7141 }, { "epoch": 2.2415944132763155, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.0471, "step": 7142 }, { "epoch": 2.24190827415748, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.1894, "step": 7143 }, { "epoch": 2.242222135038644, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.4259, "step": 7144 }, { "epoch": 2.2425359959198086, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1759, "step": 7145 }, { "epoch": 2.242849856800973, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.0261, "step": 7146 }, { "epoch": 2.2431637176821373, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.4349, "step": 7147 }, { "epoch": 2.2434775785633017, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2358, "step": 7148 }, { "epoch": 2.2437914394444665, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3888, "step": 7149 }, { "epoch": 2.244105300325631, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.1028, "step": 7150 }, { "epoch": 2.244419161206795, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.4284, "step": 7151 }, { "epoch": 2.2447330220879596, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.2476, "step": 7152 }, { "epoch": 2.245046882969124, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.7325, "step": 7153 }, { "epoch": 2.2453607438502883, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.2499, "step": 7154 }, { "epoch": 2.2456746047314526, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.5747, "step": 7155 }, { "epoch": 2.245988465612617, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.5954, "step": 7156 }, { "epoch": 2.246302326493782, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 1.643, "step": 7157 }, { "epoch": 2.246616187374946, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.609, "step": 7158 }, { "epoch": 2.2469300482561105, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.9222, "step": 7159 }, { "epoch": 2.247243909137275, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.3925, "step": 7160 }, { "epoch": 2.2475577700184393, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.4812, "step": 7161 }, { "epoch": 2.2478716308996036, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.9347, "step": 7162 }, { "epoch": 2.248185491780768, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.7704, "step": 7163 }, { "epoch": 2.248499352661933, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 1.734, "step": 7164 }, { "epoch": 2.248813213543097, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.2706, "step": 7165 }, { "epoch": 2.2491270744242615, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.4148, "step": 7166 }, { "epoch": 2.249440935305426, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.772, "step": 7167 }, { "epoch": 2.2497547961865902, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.549, "step": 7168 }, { "epoch": 2.2500686570677546, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.2603, "step": 7169 }, { "epoch": 2.250382517948919, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.5354, "step": 7170 }, { "epoch": 2.2506963788300833, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 2.1267, "step": 7171 }, { "epoch": 2.251010239711248, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 2.3051, "step": 7172 }, { "epoch": 2.2513241005924125, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.5763, "step": 7173 }, { "epoch": 2.251637961473577, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2335, "step": 7174 }, { "epoch": 2.2519518223547412, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2123, "step": 7175 }, { "epoch": 2.2522656832359056, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.5065, "step": 7176 }, { "epoch": 2.25257954411707, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.1668, "step": 7177 }, { "epoch": 2.2528934049982343, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.4468, "step": 7178 }, { "epoch": 2.253207265879399, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2751, "step": 7179 }, { "epoch": 2.2535211267605635, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.1354, "step": 7180 }, { "epoch": 2.253834987641728, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.2021, "step": 7181 }, { "epoch": 2.254148848522892, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.0891, "step": 7182 }, { "epoch": 2.2544627094040566, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.2342, "step": 7183 }, { "epoch": 2.254776570285221, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.2501, "step": 7184 }, { "epoch": 2.2550904311663853, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.326, "step": 7185 }, { "epoch": 2.25540429204755, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2328, "step": 7186 }, { "epoch": 2.2557181529287145, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2954, "step": 7187 }, { "epoch": 2.256032013809879, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.161, "step": 7188 }, { "epoch": 2.256345874691043, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.1598, "step": 7189 }, { "epoch": 2.2566597355722076, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.1464, "step": 7190 }, { "epoch": 2.256973596453372, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.1065, "step": 7191 }, { "epoch": 2.2572874573345363, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2306, "step": 7192 }, { "epoch": 2.257601318215701, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.1382, "step": 7193 }, { "epoch": 2.2579151790968655, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.1865, "step": 7194 }, { "epoch": 2.25822903997803, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2209, "step": 7195 }, { "epoch": 2.258542900859194, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.1252, "step": 7196 }, { "epoch": 2.2588567617403585, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.3115, "step": 7197 }, { "epoch": 2.259170622621523, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.0223, "step": 7198 }, { "epoch": 2.2594844835026873, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.2491, "step": 7199 }, { "epoch": 2.259798344383852, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.3421, "step": 7200 }, { "epoch": 2.2601122052650164, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.2766, "step": 7201 }, { "epoch": 2.260426066146181, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.0719, "step": 7202 }, { "epoch": 2.260739927027345, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.1353, "step": 7203 }, { "epoch": 2.2610537879085095, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.3326, "step": 7204 }, { "epoch": 2.261367648789674, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.6516, "step": 7205 }, { "epoch": 2.2616815096708383, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.3945, "step": 7206 }, { "epoch": 2.261995370552003, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.9937, "step": 7207 }, { "epoch": 2.2623092314331674, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.6256, "step": 7208 }, { "epoch": 2.262623092314332, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 1.6463, "step": 7209 }, { "epoch": 2.262936953195496, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5671, "step": 7210 }, { "epoch": 2.2632508140766605, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.4976, "step": 7211 }, { "epoch": 2.263564674957825, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.4574, "step": 7212 }, { "epoch": 2.2638785358389892, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 2.0653, "step": 7213 }, { "epoch": 2.2641923967201536, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.6966, "step": 7214 }, { "epoch": 2.2645062576013184, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.6403, "step": 7215 }, { "epoch": 2.2648201184824828, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.7164, "step": 7216 }, { "epoch": 2.265133979363647, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.7745, "step": 7217 }, { "epoch": 2.2654478402448115, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.2363, "step": 7218 }, { "epoch": 2.265761701125976, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.526, "step": 7219 }, { "epoch": 2.26607556200714, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.691, "step": 7220 }, { "epoch": 2.2663894228883046, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.7359, "step": 7221 }, { "epoch": 2.266703283769469, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 2.234, "step": 7222 }, { "epoch": 2.2670171446506338, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.6254, "step": 7223 }, { "epoch": 2.267331005531798, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.2291, "step": 7224 }, { "epoch": 2.2676448664129625, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3581, "step": 7225 }, { "epoch": 2.267958727294127, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.3394, "step": 7226 }, { "epoch": 2.268272588175291, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3901, "step": 7227 }, { "epoch": 2.2685864490564556, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.286, "step": 7228 }, { "epoch": 2.26890030993762, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.2478, "step": 7229 }, { "epoch": 2.2692141708187847, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.298, "step": 7230 }, { "epoch": 2.269528031699949, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.1992, "step": 7231 }, { "epoch": 2.2698418925811135, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.1478, "step": 7232 }, { "epoch": 2.270155753462278, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1495, "step": 7233 }, { "epoch": 2.270469614343442, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.1202, "step": 7234 }, { "epoch": 2.2707834752246066, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.1152, "step": 7235 }, { "epoch": 2.271097336105771, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2451, "step": 7236 }, { "epoch": 2.2714111969869357, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1696, "step": 7237 }, { "epoch": 2.2717250578681, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.1456, "step": 7238 }, { "epoch": 2.2720389187492644, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2046, "step": 7239 }, { "epoch": 2.272352779630429, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.0277, "step": 7240 }, { "epoch": 2.272666640511593, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1189, "step": 7241 }, { "epoch": 2.2729805013927575, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1998, "step": 7242 }, { "epoch": 2.273294362273922, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.2429, "step": 7243 }, { "epoch": 2.2736082231550867, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.1494, "step": 7244 }, { "epoch": 2.273922084036251, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.1338, "step": 7245 }, { "epoch": 2.2742359449174154, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.2464, "step": 7246 }, { "epoch": 2.27454980579858, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.1089, "step": 7247 }, { "epoch": 2.274863666679744, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2631, "step": 7248 }, { "epoch": 2.2751775275609085, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.3957, "step": 7249 }, { "epoch": 2.275491388442073, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.3219, "step": 7250 }, { "epoch": 2.2758052493232377, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.3588, "step": 7251 }, { "epoch": 2.276119110204402, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 1.6856, "step": 7252 }, { "epoch": 2.2764329710855664, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.1168, "step": 7253 }, { "epoch": 2.2767468319667308, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5774, "step": 7254 }, { "epoch": 2.277060692847895, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.6688, "step": 7255 }, { "epoch": 2.2773745537290595, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.4973, "step": 7256 }, { "epoch": 2.277688414610224, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.688, "step": 7257 }, { "epoch": 2.2780022754913887, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.2582, "step": 7258 }, { "epoch": 2.278316136372553, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.5272, "step": 7259 }, { "epoch": 2.2786299972537174, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.5659, "step": 7260 }, { "epoch": 2.2789438581348818, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.6533, "step": 7261 }, { "epoch": 2.279257719016046, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.6592, "step": 7262 }, { "epoch": 2.2795715798972105, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.5342, "step": 7263 }, { "epoch": 2.279885440778375, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 1.795, "step": 7264 }, { "epoch": 2.2801993016595397, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 1.6707, "step": 7265 }, { "epoch": 2.280513162540704, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 2.1182, "step": 7266 }, { "epoch": 2.2808270234218684, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 1.7529, "step": 7267 }, { "epoch": 2.2811408843030327, "grad_norm": 1.4765625, "learning_rate": 0.0002, "loss": 1.5335, "step": 7268 }, { "epoch": 2.281454745184197, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 1.4642, "step": 7269 }, { "epoch": 2.2817686060653615, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.5671, "step": 7270 }, { "epoch": 2.282082466946526, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.6643, "step": 7271 }, { "epoch": 2.28239632782769, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.7303, "step": 7272 }, { "epoch": 2.2827101887088546, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.6393, "step": 7273 }, { "epoch": 2.2830240495900194, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.2254, "step": 7274 }, { "epoch": 2.2833379104711837, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.2888, "step": 7275 }, { "epoch": 2.283651771352348, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3103, "step": 7276 }, { "epoch": 2.2839656322335125, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.2934, "step": 7277 }, { "epoch": 2.284279493114677, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1373, "step": 7278 }, { "epoch": 2.284593353995841, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.2261, "step": 7279 }, { "epoch": 2.2849072148770055, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2468, "step": 7280 }, { "epoch": 2.2852210757581704, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.1644, "step": 7281 }, { "epoch": 2.2855349366393347, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.3607, "step": 7282 }, { "epoch": 2.285848797520499, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1685, "step": 7283 }, { "epoch": 2.2861626584016634, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2223, "step": 7284 }, { "epoch": 2.286476519282828, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2889, "step": 7285 }, { "epoch": 2.286790380163992, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.276, "step": 7286 }, { "epoch": 2.2871042410451565, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.1342, "step": 7287 }, { "epoch": 2.2874181019263213, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.2093, "step": 7288 }, { "epoch": 2.2877319628074857, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.2935, "step": 7289 }, { "epoch": 2.28804582368865, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1321, "step": 7290 }, { "epoch": 2.2883596845698144, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.0876, "step": 7291 }, { "epoch": 2.288673545450979, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.1969, "step": 7292 }, { "epoch": 2.288987406332143, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.0424, "step": 7293 }, { "epoch": 2.288987406332143, "eval_loss": 1.8512505292892456, "eval_runtime": 123.2149, "eval_samples_per_second": 8.116, "eval_steps_per_second": 8.116, "step": 7293 }, { "epoch": 2.288987406332143, "mmlu_eval_accuracy": 0.4048621994273181, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.55, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5348837209302325, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.36363636363636365, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.305676959644806, "step": 7293 }, { "epoch": 2.2893012672133075, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2267, "step": 7294 }, { "epoch": 2.2896151280944723, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2125, "step": 7295 }, { "epoch": 2.2899289889756367, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.1225, "step": 7296 }, { "epoch": 2.290242849856801, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.2745, "step": 7297 }, { "epoch": 2.2905567107379654, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.113, "step": 7298 }, { "epoch": 2.2908705716191298, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.0614, "step": 7299 }, { "epoch": 2.291184432500294, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.0341, "step": 7300 }, { "epoch": 2.2914982933814585, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.3243, "step": 7301 }, { "epoch": 2.2918121542626233, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.1798, "step": 7302 }, { "epoch": 2.2921260151437877, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.1524, "step": 7303 }, { "epoch": 2.292439876024952, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.4504, "step": 7304 }, { "epoch": 2.2927537369061164, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.362, "step": 7305 }, { "epoch": 2.2930675977872808, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.2939, "step": 7306 }, { "epoch": 2.293381458668445, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.5712, "step": 7307 }, { "epoch": 2.2936953195496095, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.7939, "step": 7308 }, { "epoch": 2.2940091804307743, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.5016, "step": 7309 }, { "epoch": 2.2943230413119386, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.8059, "step": 7310 }, { "epoch": 2.294636902193103, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.3105, "step": 7311 }, { "epoch": 2.2949507630742674, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.532, "step": 7312 }, { "epoch": 2.2952646239554317, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.5416, "step": 7313 }, { "epoch": 2.295578484836596, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.8934, "step": 7314 }, { "epoch": 2.2958923457177605, "grad_norm": 1.828125, "learning_rate": 0.0002, "loss": 1.5368, "step": 7315 }, { "epoch": 2.2962062065989253, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4952, "step": 7316 }, { "epoch": 2.2965200674800896, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.3947, "step": 7317 }, { "epoch": 2.296833928361254, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5034, "step": 7318 }, { "epoch": 2.2971477892424184, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.6191, "step": 7319 }, { "epoch": 2.2974616501235827, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.3039, "step": 7320 }, { "epoch": 2.297775511004747, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 1.8442, "step": 7321 }, { "epoch": 2.2980893718859114, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.7986, "step": 7322 }, { "epoch": 2.298403232767076, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.4466, "step": 7323 }, { "epoch": 2.29871709364824, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.3619, "step": 7324 }, { "epoch": 2.299030954529405, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.1874, "step": 7325 }, { "epoch": 2.2993448154105693, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.3146, "step": 7326 }, { "epoch": 2.2996586762917337, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.3598, "step": 7327 }, { "epoch": 2.299972537172898, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.3186, "step": 7328 }, { "epoch": 2.3002863980540624, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3518, "step": 7329 }, { "epoch": 2.300600258935227, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.376, "step": 7330 }, { "epoch": 2.300914119816391, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.1383, "step": 7331 }, { "epoch": 2.301227980697556, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2397, "step": 7332 }, { "epoch": 2.3015418415787203, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.1586, "step": 7333 }, { "epoch": 2.3018557024598847, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.3042, "step": 7334 }, { "epoch": 2.302169563341049, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3113, "step": 7335 }, { "epoch": 2.3024834242222134, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.1487, "step": 7336 }, { "epoch": 2.3027972851033778, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.2513, "step": 7337 }, { "epoch": 2.303111145984542, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2273, "step": 7338 }, { "epoch": 2.303425006865707, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.2175, "step": 7339 }, { "epoch": 2.3037388677468713, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.0618, "step": 7340 }, { "epoch": 2.3040527286280357, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.1736, "step": 7341 }, { "epoch": 2.3043665895092, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2536, "step": 7342 }, { "epoch": 2.3046804503903644, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1165, "step": 7343 }, { "epoch": 2.3049943112715288, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.1144, "step": 7344 }, { "epoch": 2.305308172152693, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.2865, "step": 7345 }, { "epoch": 2.305622033033858, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.1935, "step": 7346 }, { "epoch": 2.3059358939150223, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.1487, "step": 7347 }, { "epoch": 2.3062497547961867, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.141, "step": 7348 }, { "epoch": 2.306563615677351, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2915, "step": 7349 }, { "epoch": 2.3068774765585154, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.1457, "step": 7350 }, { "epoch": 2.3071913374396797, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.0873, "step": 7351 }, { "epoch": 2.307505198320844, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.9769, "step": 7352 }, { "epoch": 2.307819059202009, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.2883, "step": 7353 }, { "epoch": 2.3081329200831733, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.3833, "step": 7354 }, { "epoch": 2.3084467809643376, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.2168, "step": 7355 }, { "epoch": 2.308760641845502, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.3564, "step": 7356 }, { "epoch": 2.3090745027266664, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5277, "step": 7357 }, { "epoch": 2.3093883636078307, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.4543, "step": 7358 }, { "epoch": 2.309702224488995, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 1.9339, "step": 7359 }, { "epoch": 2.31001608537016, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 1.7903, "step": 7360 }, { "epoch": 2.3103299462513243, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.324, "step": 7361 }, { "epoch": 2.3106438071324886, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.5345, "step": 7362 }, { "epoch": 2.310957668013653, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.8501, "step": 7363 }, { "epoch": 2.3112715288948174, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 1.4822, "step": 7364 }, { "epoch": 2.3115853897759817, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.565, "step": 7365 }, { "epoch": 2.311899250657146, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.6208, "step": 7366 }, { "epoch": 2.312213111538311, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.7077, "step": 7367 }, { "epoch": 2.3125269724194752, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.7671, "step": 7368 }, { "epoch": 2.3128408333006396, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.5872, "step": 7369 }, { "epoch": 2.313154694181804, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 1.4489, "step": 7370 }, { "epoch": 2.3134685550629683, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 1.6627, "step": 7371 }, { "epoch": 2.3137824159441327, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 2.599, "step": 7372 }, { "epoch": 2.314096276825297, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.4697, "step": 7373 }, { "epoch": 2.3144101377064614, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.2035, "step": 7374 }, { "epoch": 2.314723998587626, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.3164, "step": 7375 }, { "epoch": 2.3150378594687906, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.2702, "step": 7376 }, { "epoch": 2.315351720349955, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.1878, "step": 7377 }, { "epoch": 2.3156655812311193, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.2336, "step": 7378 }, { "epoch": 2.3159794421122837, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.2553, "step": 7379 }, { "epoch": 2.316293302993448, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1999, "step": 7380 }, { "epoch": 2.3166071638746124, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.1778, "step": 7381 }, { "epoch": 2.3169210247557768, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.0516, "step": 7382 }, { "epoch": 2.3172348856369416, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1528, "step": 7383 }, { "epoch": 2.317548746518106, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.3735, "step": 7384 }, { "epoch": 2.3178626073992703, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2022, "step": 7385 }, { "epoch": 2.3181764682804347, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3049, "step": 7386 }, { "epoch": 2.318490329161599, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.1096, "step": 7387 }, { "epoch": 2.3188041900427634, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.1619, "step": 7388 }, { "epoch": 2.3191180509239278, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3097, "step": 7389 }, { "epoch": 2.3194319118050926, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.253, "step": 7390 }, { "epoch": 2.319745772686257, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1523, "step": 7391 }, { "epoch": 2.3200596335674213, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.284, "step": 7392 }, { "epoch": 2.3203734944485856, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.2375, "step": 7393 }, { "epoch": 2.32068735532975, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.095, "step": 7394 }, { "epoch": 2.3210012162109144, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.9855, "step": 7395 }, { "epoch": 2.3213150770920787, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.144, "step": 7396 }, { "epoch": 2.3216289379732435, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.1119, "step": 7397 }, { "epoch": 2.321942798854408, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.1326, "step": 7398 }, { "epoch": 2.3222566597355723, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.0936, "step": 7399 }, { "epoch": 2.3225705206167366, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4238, "step": 7400 }, { "epoch": 2.322884381497901, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.5229, "step": 7401 }, { "epoch": 2.3231982423790654, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5519, "step": 7402 }, { "epoch": 2.3235121032602297, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3097, "step": 7403 }, { "epoch": 2.3238259641413945, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.4459, "step": 7404 }, { "epoch": 2.324139825022559, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7622, "step": 7405 }, { "epoch": 2.3244536859037233, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.5607, "step": 7406 }, { "epoch": 2.3247675467848876, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.6303, "step": 7407 }, { "epoch": 2.325081407666052, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.6072, "step": 7408 }, { "epoch": 2.3253952685472163, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5817, "step": 7409 }, { "epoch": 2.3257091294283807, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.5388, "step": 7410 }, { "epoch": 2.3260229903095455, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.4529, "step": 7411 }, { "epoch": 2.32633685119071, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5822, "step": 7412 }, { "epoch": 2.3266507120718742, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.0121, "step": 7413 }, { "epoch": 2.3269645729530386, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.5172, "step": 7414 }, { "epoch": 2.327278433834203, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.7323, "step": 7415 }, { "epoch": 2.3275922947153673, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 2.0126, "step": 7416 }, { "epoch": 2.3279061555965317, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.4165, "step": 7417 }, { "epoch": 2.3282200164776965, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.2381, "step": 7418 }, { "epoch": 2.328533877358861, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.2845, "step": 7419 }, { "epoch": 2.3288477382400252, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.3803, "step": 7420 }, { "epoch": 2.3291615991211896, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.509, "step": 7421 }, { "epoch": 2.329475460002354, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.7149, "step": 7422 }, { "epoch": 2.3297893208835183, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.6408, "step": 7423 }, { "epoch": 2.3301031817646827, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3261, "step": 7424 }, { "epoch": 2.330417042645847, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.2783, "step": 7425 }, { "epoch": 2.330730903527012, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3528, "step": 7426 }, { "epoch": 2.331044764408176, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.4038, "step": 7427 }, { "epoch": 2.3313586252893406, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.2779, "step": 7428 }, { "epoch": 2.331672486170505, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.5166, "step": 7429 }, { "epoch": 2.3319863470516693, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.1432, "step": 7430 }, { "epoch": 2.3323002079328337, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.2069, "step": 7431 }, { "epoch": 2.332614068813998, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3675, "step": 7432 }, { "epoch": 2.3329279296951624, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.2625, "step": 7433 }, { "epoch": 2.333241790576327, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.1656, "step": 7434 }, { "epoch": 2.3335556514574916, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.0226, "step": 7435 }, { "epoch": 2.333869512338656, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.1365, "step": 7436 }, { "epoch": 2.3341833732198203, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.1946, "step": 7437 }, { "epoch": 2.3344972341009846, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.247, "step": 7438 }, { "epoch": 2.334811094982149, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1363, "step": 7439 }, { "epoch": 2.3351249558633134, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.1791, "step": 7440 }, { "epoch": 2.335438816744478, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2446, "step": 7441 }, { "epoch": 2.3357526776256425, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.2902, "step": 7442 }, { "epoch": 2.336066538506807, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.1283, "step": 7443 }, { "epoch": 2.3363803993879713, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.1388, "step": 7444 }, { "epoch": 2.3366942602691356, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.4106, "step": 7445 }, { "epoch": 2.3370081211503, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.0115, "step": 7446 }, { "epoch": 2.3373219820314644, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1647, "step": 7447 }, { "epoch": 2.337635842912629, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.2104, "step": 7448 }, { "epoch": 2.3379497037937935, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.393, "step": 7449 }, { "epoch": 2.338263564674958, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.2629, "step": 7450 }, { "epoch": 2.3385774255561222, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.2744, "step": 7451 }, { "epoch": 2.3388912864372866, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.3664, "step": 7452 }, { "epoch": 2.339205147318451, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2567, "step": 7453 }, { "epoch": 2.3395190081996153, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.4276, "step": 7454 }, { "epoch": 2.33983286908078, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.4587, "step": 7455 }, { "epoch": 2.3401467299619445, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.423, "step": 7456 }, { "epoch": 2.340460590843109, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.3277, "step": 7457 }, { "epoch": 2.3407744517242732, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.2941, "step": 7458 }, { "epoch": 2.3410883126054376, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.5982, "step": 7459 }, { "epoch": 2.341402173486602, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 1.3236, "step": 7460 }, { "epoch": 2.3417160343677663, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.4465, "step": 7461 }, { "epoch": 2.342029895248931, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.789, "step": 7462 }, { "epoch": 2.3423437561300955, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.8546, "step": 7463 }, { "epoch": 2.34265761701126, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.9523, "step": 7464 }, { "epoch": 2.342971477892424, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.8784, "step": 7465 }, { "epoch": 2.3432853387735886, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.8553, "step": 7466 }, { "epoch": 2.343599199654753, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.4822, "step": 7467 }, { "epoch": 2.3439130605359173, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.4605, "step": 7468 }, { "epoch": 2.344226921417082, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.6513, "step": 7469 }, { "epoch": 2.3445407822982465, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.7872, "step": 7470 }, { "epoch": 2.344854643179411, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.4599, "step": 7471 }, { "epoch": 2.345168504060575, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 2.2205, "step": 7472 }, { "epoch": 2.3454823649417396, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.5578, "step": 7473 }, { "epoch": 2.345796225822904, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.1971, "step": 7474 }, { "epoch": 2.3461100867040683, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.2586, "step": 7475 }, { "epoch": 2.346423947585233, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.2479, "step": 7476 }, { "epoch": 2.3467378084663975, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.332, "step": 7477 }, { "epoch": 2.347051669347562, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.1617, "step": 7478 }, { "epoch": 2.347365530228726, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2061, "step": 7479 }, { "epoch": 2.3476793911098905, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2111, "step": 7480 }, { "epoch": 2.3476793911098905, "eval_loss": 1.8935425281524658, "eval_runtime": 123.4423, "eval_samples_per_second": 8.101, "eval_steps_per_second": 8.101, "step": 7480 }, { "epoch": 2.3476793911098905, "mmlu_eval_accuracy": 0.3774387542205415, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.1875, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.4090909090909091, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907, "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, "mmlu_eval_accuracy_high_school_microeconomics": 0.4230769230769231, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.5333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.45454545454545453, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5, "mmlu_eval_accuracy_moral_disputes": 0.5789473684210527, "mmlu_eval_accuracy_moral_scenarios": 0.32, "mmlu_eval_accuracy_nutrition": 0.5151515151515151, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.2571428571428571, "mmlu_eval_accuracy_professional_accounting": 0.1935483870967742, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.2222222222222222, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.47368421052631576, "mmlu_loss": 1.1910356640776802, "step": 7480 }, { "epoch": 2.347993251991055, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.1232, "step": 7481 }, { "epoch": 2.3483071128722193, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.213, "step": 7482 }, { "epoch": 2.3486209737533836, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.1983, "step": 7483 }, { "epoch": 2.348934834634548, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2784, "step": 7484 }, { "epoch": 2.349248695515713, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1297, "step": 7485 }, { "epoch": 2.349562556396877, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1954, "step": 7486 }, { "epoch": 2.3498764172780415, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2602, "step": 7487 }, { "epoch": 2.350190278159206, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.2229, "step": 7488 }, { "epoch": 2.3505041390403703, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2742, "step": 7489 }, { "epoch": 2.3508179999215346, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.1714, "step": 7490 }, { "epoch": 2.351131860802699, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.079, "step": 7491 }, { "epoch": 2.351445721683864, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.1073, "step": 7492 }, { "epoch": 2.351759582565028, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.121, "step": 7493 }, { "epoch": 2.3520734434461925, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.3863, "step": 7494 }, { "epoch": 2.352387304327357, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.0825, "step": 7495 }, { "epoch": 2.3527011652085212, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.2917, "step": 7496 }, { "epoch": 2.3530150260896856, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.1381, "step": 7497 }, { "epoch": 2.35332888697085, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.4117, "step": 7498 }, { "epoch": 2.3536427478520148, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.2255, "step": 7499 }, { "epoch": 2.353956608733179, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.351, "step": 7500 }, { "epoch": 2.3542704696143435, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.1369, "step": 7501 }, { "epoch": 2.354584330495508, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3572, "step": 7502 }, { "epoch": 2.3548981913766722, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.4492, "step": 7503 }, { "epoch": 2.3552120522578366, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.4165, "step": 7504 }, { "epoch": 2.355525913139001, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.3938, "step": 7505 }, { "epoch": 2.3558397740201658, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.5443, "step": 7506 }, { "epoch": 2.35615363490133, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.513, "step": 7507 }, { "epoch": 2.3564674957824945, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.4569, "step": 7508 }, { "epoch": 2.356781356663659, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 2.0693, "step": 7509 }, { "epoch": 2.357095217544823, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.4543, "step": 7510 }, { "epoch": 2.3574090784259876, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.6564, "step": 7511 }, { "epoch": 2.357722939307152, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.4297, "step": 7512 }, { "epoch": 2.3580368001883167, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.2009, "step": 7513 }, { "epoch": 2.358350661069481, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.4381, "step": 7514 }, { "epoch": 2.3586645219506455, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.6546, "step": 7515 }, { "epoch": 2.35897838283181, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.5741, "step": 7516 }, { "epoch": 2.359292243712974, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.8156, "step": 7517 }, { "epoch": 2.3596061045941386, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 1.6448, "step": 7518 }, { "epoch": 2.359919965475303, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.2672, "step": 7519 }, { "epoch": 2.3602338263564677, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.348, "step": 7520 }, { "epoch": 2.360547687237632, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.6876, "step": 7521 }, { "epoch": 2.3608615481187964, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 2.4533, "step": 7522 }, { "epoch": 2.361175408999961, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.5787, "step": 7523 }, { "epoch": 2.361489269881125, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.4307, "step": 7524 }, { "epoch": 2.3618031307622895, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.2291, "step": 7525 }, { "epoch": 2.362116991643454, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.2793, "step": 7526 }, { "epoch": 2.3624308525246187, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.2511, "step": 7527 }, { "epoch": 2.362744713405783, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2715, "step": 7528 }, { "epoch": 2.3630585742869474, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2203, "step": 7529 }, { "epoch": 2.363372435168112, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.3121, "step": 7530 }, { "epoch": 2.363686296049276, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2616, "step": 7531 }, { "epoch": 2.3640001569304405, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.1409, "step": 7532 }, { "epoch": 2.364314017811605, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.127, "step": 7533 }, { "epoch": 2.3646278786927692, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2468, "step": 7534 }, { "epoch": 2.3649417395739336, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2356, "step": 7535 }, { "epoch": 2.3652556004550984, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2608, "step": 7536 }, { "epoch": 2.365569461336263, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2708, "step": 7537 }, { "epoch": 2.365883322217427, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 0.9963, "step": 7538 }, { "epoch": 2.3661971830985915, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2078, "step": 7539 }, { "epoch": 2.366511043979756, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1852, "step": 7540 }, { "epoch": 2.3668249048609202, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2221, "step": 7541 }, { "epoch": 2.3671387657420846, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.169, "step": 7542 }, { "epoch": 2.3674526266232494, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.0417, "step": 7543 }, { "epoch": 2.3677664875044138, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2473, "step": 7544 }, { "epoch": 2.368080348385578, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.2241, "step": 7545 }, { "epoch": 2.3683942092667425, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.0858, "step": 7546 }, { "epoch": 2.368708070147907, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.1838, "step": 7547 }, { "epoch": 2.369021931029071, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.2032, "step": 7548 }, { "epoch": 2.3693357919102356, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.2844, "step": 7549 }, { "epoch": 2.3696496527914004, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.4577, "step": 7550 }, { "epoch": 2.3699635136725647, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.3346, "step": 7551 }, { "epoch": 2.370277374553729, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.4026, "step": 7552 }, { "epoch": 2.3705912354348935, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.2713, "step": 7553 }, { "epoch": 2.370905096316058, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.3758, "step": 7554 }, { "epoch": 2.371218957197222, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.4112, "step": 7555 }, { "epoch": 2.3715328180783866, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.5466, "step": 7556 }, { "epoch": 2.3718466789595514, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.4977, "step": 7557 }, { "epoch": 2.3721605398407157, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.4294, "step": 7558 }, { "epoch": 2.37247440072188, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.338, "step": 7559 }, { "epoch": 2.3727882616030445, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.8784, "step": 7560 }, { "epoch": 2.373102122484209, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5111, "step": 7561 }, { "epoch": 2.373415983365373, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.8523, "step": 7562 }, { "epoch": 2.3737298442465375, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.5381, "step": 7563 }, { "epoch": 2.3740437051277024, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4634, "step": 7564 }, { "epoch": 2.3743575660088667, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.4628, "step": 7565 }, { "epoch": 2.374671426890031, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 1.8217, "step": 7566 }, { "epoch": 2.3749852877711954, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.9564, "step": 7567 }, { "epoch": 2.37529914865236, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.5601, "step": 7568 }, { "epoch": 2.375613009533524, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.2211, "step": 7569 }, { "epoch": 2.3759268704146885, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.2683, "step": 7570 }, { "epoch": 2.3762407312958533, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.6888, "step": 7571 }, { "epoch": 2.3765545921770177, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 2.1861, "step": 7572 }, { "epoch": 2.376868453058182, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.6138, "step": 7573 }, { "epoch": 2.3771823139393464, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.489, "step": 7574 }, { "epoch": 2.377496174820511, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.2304, "step": 7575 }, { "epoch": 2.377810035701675, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4376, "step": 7576 }, { "epoch": 2.3781238965828395, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.3264, "step": 7577 }, { "epoch": 2.3784377574640043, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3431, "step": 7578 }, { "epoch": 2.3787516183451687, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.3488, "step": 7579 }, { "epoch": 2.379065479226333, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.358, "step": 7580 }, { "epoch": 2.3793793401074974, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.282, "step": 7581 }, { "epoch": 2.3796932009886618, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.1545, "step": 7582 }, { "epoch": 2.380007061869826, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2513, "step": 7583 }, { "epoch": 2.3803209227509905, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2063, "step": 7584 }, { "epoch": 2.380634783632155, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.1785, "step": 7585 }, { "epoch": 2.3809486445133192, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.159, "step": 7586 }, { "epoch": 2.381262505394484, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.3261, "step": 7587 }, { "epoch": 2.3815763662756484, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.1649, "step": 7588 }, { "epoch": 2.3818902271568128, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2118, "step": 7589 }, { "epoch": 2.382204088037977, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2561, "step": 7590 }, { "epoch": 2.3825179489191415, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1133, "step": 7591 }, { "epoch": 2.382831809800306, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.3717, "step": 7592 }, { "epoch": 2.38314567068147, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.0774, "step": 7593 }, { "epoch": 2.383459531562635, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.2753, "step": 7594 }, { "epoch": 2.3837733924437994, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.3099, "step": 7595 }, { "epoch": 2.3840872533249637, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2474, "step": 7596 }, { "epoch": 2.384401114206128, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.1762, "step": 7597 }, { "epoch": 2.3847149750872925, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.0864, "step": 7598 }, { "epoch": 2.385028835968457, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.2909, "step": 7599 }, { "epoch": 2.385342696849621, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.1975, "step": 7600 }, { "epoch": 2.385656557730786, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.1689, "step": 7601 }, { "epoch": 2.3859704186119504, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.4181, "step": 7602 }, { "epoch": 2.3862842794931147, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.4699, "step": 7603 }, { "epoch": 2.386598140374279, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4413, "step": 7604 }, { "epoch": 2.3869120012554434, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3046, "step": 7605 }, { "epoch": 2.387225862136608, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.4347, "step": 7606 }, { "epoch": 2.387539723017772, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.5458, "step": 7607 }, { "epoch": 2.387853583898937, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 2.0128, "step": 7608 }, { "epoch": 2.3881674447801013, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4937, "step": 7609 }, { "epoch": 2.3884813056612657, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.6072, "step": 7610 }, { "epoch": 2.38879516654243, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.7504, "step": 7611 }, { "epoch": 2.3891090274235944, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.6268, "step": 7612 }, { "epoch": 2.389422888304759, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.8109, "step": 7613 }, { "epoch": 2.389736749185923, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.383, "step": 7614 }, { "epoch": 2.390050610067088, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.8421, "step": 7615 }, { "epoch": 2.3903644709482523, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.9172, "step": 7616 }, { "epoch": 2.3906783318294167, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.1838, "step": 7617 }, { "epoch": 2.390992192710581, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.6015, "step": 7618 }, { "epoch": 2.3913060535917454, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.7481, "step": 7619 }, { "epoch": 2.39161991447291, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.6861, "step": 7620 }, { "epoch": 2.391933775354074, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.4042, "step": 7621 }, { "epoch": 2.392247636235239, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.8288, "step": 7622 }, { "epoch": 2.3925614971164033, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.7302, "step": 7623 }, { "epoch": 2.3928753579975677, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.2854, "step": 7624 }, { "epoch": 2.393189218878732, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.1566, "step": 7625 }, { "epoch": 2.3935030797598964, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3792, "step": 7626 }, { "epoch": 2.3938169406410608, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.19, "step": 7627 }, { "epoch": 2.394130801522225, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4167, "step": 7628 }, { "epoch": 2.39444466240339, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3733, "step": 7629 }, { "epoch": 2.3947585232845543, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.1987, "step": 7630 }, { "epoch": 2.3950723841657187, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3062, "step": 7631 }, { "epoch": 2.395386245046883, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.2396, "step": 7632 }, { "epoch": 2.3957001059280474, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.2648, "step": 7633 }, { "epoch": 2.3960139668092117, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.2623, "step": 7634 }, { "epoch": 2.396327827690376, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3733, "step": 7635 }, { "epoch": 2.3966416885715405, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2008, "step": 7636 }, { "epoch": 2.3969555494527053, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.346, "step": 7637 }, { "epoch": 2.3972694103338696, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.1256, "step": 7638 }, { "epoch": 2.397583271215034, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.0135, "step": 7639 }, { "epoch": 2.3978971320961984, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3414, "step": 7640 }, { "epoch": 2.3982109929773627, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1485, "step": 7641 }, { "epoch": 2.398524853858527, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1127, "step": 7642 }, { "epoch": 2.3988387147396915, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.1797, "step": 7643 }, { "epoch": 2.399152575620856, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.1985, "step": 7644 }, { "epoch": 2.3994664365020206, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.046, "step": 7645 }, { "epoch": 2.399780297383185, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.336, "step": 7646 }, { "epoch": 2.4000941582643494, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.2497, "step": 7647 }, { "epoch": 2.4004080191455137, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.3432, "step": 7648 }, { "epoch": 2.400721880026678, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.17, "step": 7649 }, { "epoch": 2.4010357409078424, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.1773, "step": 7650 }, { "epoch": 2.401349601789007, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.1109, "step": 7651 }, { "epoch": 2.4016634626701716, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3537, "step": 7652 }, { "epoch": 2.401977323551336, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.1591, "step": 7653 }, { "epoch": 2.4022911844325003, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.4013, "step": 7654 }, { "epoch": 2.4026050453136647, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.6239, "step": 7655 }, { "epoch": 2.402918906194829, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.8728, "step": 7656 }, { "epoch": 2.4032327670759934, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.4437, "step": 7657 }, { "epoch": 2.403546627957158, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 2.0085, "step": 7658 }, { "epoch": 2.4038604888383226, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.9269, "step": 7659 }, { "epoch": 2.404174349719487, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.3327, "step": 7660 }, { "epoch": 2.4044882106006513, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.8981, "step": 7661 }, { "epoch": 2.4048020714818157, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.319, "step": 7662 }, { "epoch": 2.40511593236298, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.4913, "step": 7663 }, { "epoch": 2.4054297932441444, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.9475, "step": 7664 }, { "epoch": 2.4057436541253088, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.7267, "step": 7665 }, { "epoch": 2.4060575150064736, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.3683, "step": 7666 }, { "epoch": 2.406371375887638, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 1.7277, "step": 7667 }, { "epoch": 2.406371375887638, "eval_loss": 1.8807933330535889, "eval_runtime": 123.1847, "eval_samples_per_second": 8.118, "eval_steps_per_second": 8.118, "step": 7667 }, { "epoch": 2.406371375887638, "mmlu_eval_accuracy": 0.41309661099491723, "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.1951219512195122, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.5, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.65, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.6363636363636364, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6956521739130435, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.8, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.5263157894736842, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.5263157894736842, "mmlu_loss": 1.1642392259516956, "step": 7667 }, { "epoch": 2.4066852367688023, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 1.3319, "step": 7668 }, { "epoch": 2.4069990976499667, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5124, "step": 7669 }, { "epoch": 2.407312958531131, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.7205, "step": 7670 }, { "epoch": 2.4076268194122954, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.8737, "step": 7671 }, { "epoch": 2.4079406802934598, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 2.1204, "step": 7672 }, { "epoch": 2.4082545411746246, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.6131, "step": 7673 }, { "epoch": 2.408568402055789, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.1128, "step": 7674 }, { "epoch": 2.4088822629369533, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.3572, "step": 7675 }, { "epoch": 2.4091961238181177, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3329, "step": 7676 }, { "epoch": 2.409509984699282, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.4067, "step": 7677 }, { "epoch": 2.4098238455804464, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.2161, "step": 7678 }, { "epoch": 2.4101377064616107, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.2445, "step": 7679 }, { "epoch": 2.4104515673427755, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3158, "step": 7680 }, { "epoch": 2.41076542822394, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.3432, "step": 7681 }, { "epoch": 2.4110792891051043, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.1417, "step": 7682 }, { "epoch": 2.4113931499862686, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.3482, "step": 7683 }, { "epoch": 2.411707010867433, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.1931, "step": 7684 }, { "epoch": 2.4120208717485974, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.325, "step": 7685 }, { "epoch": 2.4123347326297617, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2304, "step": 7686 }, { "epoch": 2.4126485935109265, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.0299, "step": 7687 }, { "epoch": 2.412962454392091, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.1227, "step": 7688 }, { "epoch": 2.4132763152732553, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.0972, "step": 7689 }, { "epoch": 2.4135901761544196, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.1396, "step": 7690 }, { "epoch": 2.413904037035584, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.2557, "step": 7691 }, { "epoch": 2.4142178979167483, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.0896, "step": 7692 }, { "epoch": 2.4145317587979127, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.1399, "step": 7693 }, { "epoch": 2.414845619679077, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.0606, "step": 7694 }, { "epoch": 2.4151594805602414, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.3909, "step": 7695 }, { "epoch": 2.4154733414414062, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2732, "step": 7696 }, { "epoch": 2.4157872023225706, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.2149, "step": 7697 }, { "epoch": 2.416101063203735, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.2243, "step": 7698 }, { "epoch": 2.4164149240848993, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.3006, "step": 7699 }, { "epoch": 2.4167287849660637, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.1579, "step": 7700 }, { "epoch": 2.417042645847228, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.2826, "step": 7701 }, { "epoch": 2.4173565067283924, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.4205, "step": 7702 }, { "epoch": 2.4176703676095572, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.3593, "step": 7703 }, { "epoch": 2.4179842284907216, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.3245, "step": 7704 }, { "epoch": 2.418298089371886, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3828, "step": 7705 }, { "epoch": 2.4186119502530503, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.7869, "step": 7706 }, { "epoch": 2.4189258111342147, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.4622, "step": 7707 }, { "epoch": 2.419239672015379, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.5695, "step": 7708 }, { "epoch": 2.4195535328965434, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.482, "step": 7709 }, { "epoch": 2.419867393777708, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.6119, "step": 7710 }, { "epoch": 2.4201812546588726, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.5564, "step": 7711 }, { "epoch": 2.420495115540037, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 2.0089, "step": 7712 }, { "epoch": 2.4208089764212013, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.567, "step": 7713 }, { "epoch": 2.4211228373023657, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 2.0211, "step": 7714 }, { "epoch": 2.42143669818353, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 1.5029, "step": 7715 }, { "epoch": 2.4217505590646944, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.0539, "step": 7716 }, { "epoch": 2.422064419945859, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.7136, "step": 7717 }, { "epoch": 2.4223782808270236, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.4389, "step": 7718 }, { "epoch": 2.422692141708188, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.5873, "step": 7719 }, { "epoch": 2.4230060025893523, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.3931, "step": 7720 }, { "epoch": 2.4233198634705166, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.6665, "step": 7721 }, { "epoch": 2.423633724351681, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 2.0482, "step": 7722 }, { "epoch": 2.4239475852328454, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.5806, "step": 7723 }, { "epoch": 2.42426144611401, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.2527, "step": 7724 }, { "epoch": 2.4245753069951745, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.2892, "step": 7725 }, { "epoch": 2.424889167876339, "grad_norm": 0.19921875, "learning_rate": 0.0002, "loss": 1.307, "step": 7726 }, { "epoch": 2.4252030287575033, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.3538, "step": 7727 }, { "epoch": 2.4255168896386676, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2927, "step": 7728 }, { "epoch": 2.425830750519832, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2561, "step": 7729 }, { "epoch": 2.4261446114009964, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.3518, "step": 7730 }, { "epoch": 2.426458472282161, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2576, "step": 7731 }, { "epoch": 2.4267723331633255, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.191, "step": 7732 }, { "epoch": 2.42708619404449, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2097, "step": 7733 }, { "epoch": 2.4274000549256542, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.36, "step": 7734 }, { "epoch": 2.4277139158068186, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.3429, "step": 7735 }, { "epoch": 2.428027776687983, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.1519, "step": 7736 }, { "epoch": 2.4283416375691473, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.14, "step": 7737 }, { "epoch": 2.428655498450312, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2666, "step": 7738 }, { "epoch": 2.4289693593314765, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2836, "step": 7739 }, { "epoch": 2.429283220212641, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.0682, "step": 7740 }, { "epoch": 2.4295970810938052, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.0658, "step": 7741 }, { "epoch": 2.4299109419749696, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.0664, "step": 7742 }, { "epoch": 2.430224802856134, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.202, "step": 7743 }, { "epoch": 2.4305386637372983, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1887, "step": 7744 }, { "epoch": 2.4308525246184627, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.1607, "step": 7745 }, { "epoch": 2.431166385499627, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.1466, "step": 7746 }, { "epoch": 2.431480246380792, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.2324, "step": 7747 }, { "epoch": 2.431794107261956, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.1999, "step": 7748 }, { "epoch": 2.4321079681431206, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.1594, "step": 7749 }, { "epoch": 2.432421829024285, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.2988, "step": 7750 }, { "epoch": 2.4327356899054493, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.6062, "step": 7751 }, { "epoch": 2.4330495507866137, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.1449, "step": 7752 }, { "epoch": 2.433363411667778, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.6523, "step": 7753 }, { "epoch": 2.433677272548943, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.2602, "step": 7754 }, { "epoch": 2.433991133430107, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.7511, "step": 7755 }, { "epoch": 2.4343049943112716, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.275, "step": 7756 }, { "epoch": 2.434618855192436, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.7591, "step": 7757 }, { "epoch": 2.4349327160736003, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.526, "step": 7758 }, { "epoch": 2.4352465769547647, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.5648, "step": 7759 }, { "epoch": 2.435560437835929, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.6415, "step": 7760 }, { "epoch": 2.435874298717094, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.7678, "step": 7761 }, { "epoch": 2.436188159598258, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5692, "step": 7762 }, { "epoch": 2.4365020204794225, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.4177, "step": 7763 }, { "epoch": 2.436815881360587, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.6958, "step": 7764 }, { "epoch": 2.4371297422417513, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.8524, "step": 7765 }, { "epoch": 2.4374436031229156, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3297, "step": 7766 }, { "epoch": 2.43775746400408, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.6228, "step": 7767 }, { "epoch": 2.438071324885245, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.4021, "step": 7768 }, { "epoch": 2.438385185766409, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.2827, "step": 7769 }, { "epoch": 2.4386990466475735, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 1.8537, "step": 7770 }, { "epoch": 2.439012907528738, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.3937, "step": 7771 }, { "epoch": 2.4393267684099023, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 2.5302, "step": 7772 }, { "epoch": 2.4396406292910666, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.5574, "step": 7773 }, { "epoch": 2.439954490172231, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.2218, "step": 7774 }, { "epoch": 2.440268351053396, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.1567, "step": 7775 }, { "epoch": 2.44058221193456, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.2398, "step": 7776 }, { "epoch": 2.4408960728157245, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.2381, "step": 7777 }, { "epoch": 2.441209933696889, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2362, "step": 7778 }, { "epoch": 2.4415237945780532, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2873, "step": 7779 }, { "epoch": 2.4418376554592176, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3658, "step": 7780 }, { "epoch": 2.442151516340382, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2976, "step": 7781 }, { "epoch": 2.4424653772215468, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1607, "step": 7782 }, { "epoch": 2.442779238102711, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.3697, "step": 7783 }, { "epoch": 2.4430930989838755, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.3822, "step": 7784 }, { "epoch": 2.44340695986504, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.0846, "step": 7785 }, { "epoch": 2.4437208207462042, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2304, "step": 7786 }, { "epoch": 2.4440346816273686, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.289, "step": 7787 }, { "epoch": 2.444348542508533, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2578, "step": 7788 }, { "epoch": 2.4446624033896978, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2021, "step": 7789 }, { "epoch": 2.444976264270862, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1785, "step": 7790 }, { "epoch": 2.4452901251520265, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.2989, "step": 7791 }, { "epoch": 2.445603986033191, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.1449, "step": 7792 }, { "epoch": 2.445917846914355, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.4418, "step": 7793 }, { "epoch": 2.4462317077955196, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.153, "step": 7794 }, { "epoch": 2.446545568676684, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.2507, "step": 7795 }, { "epoch": 2.4468594295578483, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.1606, "step": 7796 }, { "epoch": 2.4471732904390127, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.2137, "step": 7797 }, { "epoch": 2.4474871513201775, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.2885, "step": 7798 }, { "epoch": 2.447801012201342, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.5443, "step": 7799 }, { "epoch": 2.448114873082506, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.2144, "step": 7800 }, { "epoch": 2.4484287339636706, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.3909, "step": 7801 }, { "epoch": 2.448742594844835, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.37, "step": 7802 }, { "epoch": 2.4490564557259993, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.2949, "step": 7803 }, { "epoch": 2.4493703166071636, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.7602, "step": 7804 }, { "epoch": 2.4496841774883285, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.4979, "step": 7805 }, { "epoch": 2.449998038369493, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.8162, "step": 7806 }, { "epoch": 2.450311899250657, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.7862, "step": 7807 }, { "epoch": 2.4506257601318215, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.7346, "step": 7808 }, { "epoch": 2.450939621012986, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 1.6698, "step": 7809 }, { "epoch": 2.4512534818941503, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.6021, "step": 7810 }, { "epoch": 2.4515673427753146, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.5084, "step": 7811 }, { "epoch": 2.4518812036564794, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.7023, "step": 7812 }, { "epoch": 2.452195064537644, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.7789, "step": 7813 }, { "epoch": 2.452508925418808, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.6994, "step": 7814 }, { "epoch": 2.4528227862999725, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.6418, "step": 7815 }, { "epoch": 2.453136647181137, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.5241, "step": 7816 }, { "epoch": 2.4534505080623012, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 1.5741, "step": 7817 }, { "epoch": 2.4537643689434656, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.3351, "step": 7818 }, { "epoch": 2.4540782298246304, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.2748, "step": 7819 }, { "epoch": 2.454392090705795, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.4102, "step": 7820 }, { "epoch": 2.454705951586959, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.4747, "step": 7821 }, { "epoch": 2.4550198124681235, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 1.8248, "step": 7822 }, { "epoch": 2.455333673349288, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.6179, "step": 7823 }, { "epoch": 2.4556475342304522, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.2985, "step": 7824 }, { "epoch": 2.4559613951116166, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.3181, "step": 7825 }, { "epoch": 2.4562752559927814, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.4005, "step": 7826 }, { "epoch": 2.4565891168739458, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.236, "step": 7827 }, { "epoch": 2.45690297775511, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.3331, "step": 7828 }, { "epoch": 2.4572168386362745, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2984, "step": 7829 }, { "epoch": 2.457530699517439, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2279, "step": 7830 }, { "epoch": 2.457844560398603, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1096, "step": 7831 }, { "epoch": 2.4581584212797676, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.1899, "step": 7832 }, { "epoch": 2.4584722821609324, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.2368, "step": 7833 }, { "epoch": 2.4587861430420968, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2286, "step": 7834 }, { "epoch": 2.459100003923261, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.228, "step": 7835 }, { "epoch": 2.4594138648044255, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2755, "step": 7836 }, { "epoch": 2.45972772568559, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.2708, "step": 7837 }, { "epoch": 2.460041586566754, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1981, "step": 7838 }, { "epoch": 2.4603554474479186, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2338, "step": 7839 }, { "epoch": 2.4606693083290834, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2709, "step": 7840 }, { "epoch": 2.4609831692102477, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.1224, "step": 7841 }, { "epoch": 2.461297030091412, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.2273, "step": 7842 }, { "epoch": 2.4616108909725765, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1939, "step": 7843 }, { "epoch": 2.461924751853741, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.2023, "step": 7844 }, { "epoch": 2.462238612734905, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2888, "step": 7845 }, { "epoch": 2.4625524736160695, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.1506, "step": 7846 }, { "epoch": 2.462866334497234, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.0527, "step": 7847 }, { "epoch": 2.4631801953783987, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.0532, "step": 7848 }, { "epoch": 2.463494056259563, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.1884, "step": 7849 }, { "epoch": 2.4638079171407274, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.3366, "step": 7850 }, { "epoch": 2.464121778021892, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.4498, "step": 7851 }, { "epoch": 2.464435638903056, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.3936, "step": 7852 }, { "epoch": 2.4647494997842205, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.3656, "step": 7853 }, { "epoch": 2.465063360665385, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.3649, "step": 7854 }, { "epoch": 2.465063360665385, "eval_loss": 1.8672763109207153, "eval_runtime": 123.3878, "eval_samples_per_second": 8.105, "eval_steps_per_second": 8.105, "step": 7854 }, { "epoch": 2.465063360665385, "mmlu_eval_accuracy": 0.4157366111016451, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.6, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.6521739130434783, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.84, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.4492753623188406, "mmlu_eval_accuracy_public_relations": 0.6666666666666666, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.2390411817058093, "step": 7854 }, { "epoch": 2.4653772215465493, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.4188, "step": 7855 }, { "epoch": 2.465691082427714, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.7146, "step": 7856 }, { "epoch": 2.4660049433088784, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.5056, "step": 7857 }, { "epoch": 2.466318804190043, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.5195, "step": 7858 }, { "epoch": 2.466632665071207, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 1.5999, "step": 7859 }, { "epoch": 2.4669465259523715, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.7737, "step": 7860 }, { "epoch": 2.467260386833536, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.6586, "step": 7861 }, { "epoch": 2.4675742477147002, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.7174, "step": 7862 }, { "epoch": 2.467888108595865, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.7259, "step": 7863 }, { "epoch": 2.4682019694770294, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.2782, "step": 7864 }, { "epoch": 2.4685158303581938, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.2778, "step": 7865 }, { "epoch": 2.468829691239358, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.5447, "step": 7866 }, { "epoch": 2.4691435521205225, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.4328, "step": 7867 }, { "epoch": 2.469457413001687, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.4528, "step": 7868 }, { "epoch": 2.4697712738828512, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.7626, "step": 7869 }, { "epoch": 2.470085134764016, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.6889, "step": 7870 }, { "epoch": 2.4703989956451804, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 2.2578, "step": 7871 }, { "epoch": 2.4707128565263448, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 2.0327, "step": 7872 }, { "epoch": 2.471026717407509, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.4924, "step": 7873 }, { "epoch": 2.4713405782886735, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.4465, "step": 7874 }, { "epoch": 2.471654439169838, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.0916, "step": 7875 }, { "epoch": 2.471968300051002, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.3171, "step": 7876 }, { "epoch": 2.472282160932167, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3244, "step": 7877 }, { "epoch": 2.4725960218133314, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2169, "step": 7878 }, { "epoch": 2.4729098826944957, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.2338, "step": 7879 }, { "epoch": 2.47322374357566, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.212, "step": 7880 }, { "epoch": 2.4735376044568245, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3354, "step": 7881 }, { "epoch": 2.473851465337989, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2763, "step": 7882 }, { "epoch": 2.474165326219153, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3363, "step": 7883 }, { "epoch": 2.474479187100318, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.183, "step": 7884 }, { "epoch": 2.4747930479814824, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.2574, "step": 7885 }, { "epoch": 2.4751069088626467, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.0886, "step": 7886 }, { "epoch": 2.475420769743811, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.0826, "step": 7887 }, { "epoch": 2.4757346306249755, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1898, "step": 7888 }, { "epoch": 2.47604849150614, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.16, "step": 7889 }, { "epoch": 2.476362352387304, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2202, "step": 7890 }, { "epoch": 2.476676213268469, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.1367, "step": 7891 }, { "epoch": 2.4769900741496333, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2364, "step": 7892 }, { "epoch": 2.4773039350307977, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.4015, "step": 7893 }, { "epoch": 2.477617795911962, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.124, "step": 7894 }, { "epoch": 2.4779316567931264, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.2469, "step": 7895 }, { "epoch": 2.478245517674291, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.113, "step": 7896 }, { "epoch": 2.478559378555455, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.3857, "step": 7897 }, { "epoch": 2.4788732394366195, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.1959, "step": 7898 }, { "epoch": 2.4791871003177843, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.3862, "step": 7899 }, { "epoch": 2.4795009611989487, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.9915, "step": 7900 }, { "epoch": 2.479814822080113, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.3092, "step": 7901 }, { "epoch": 2.4801286829612774, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.1203, "step": 7902 }, { "epoch": 2.480442543842442, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.2783, "step": 7903 }, { "epoch": 2.480756404723606, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.2813, "step": 7904 }, { "epoch": 2.4810702656047705, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 1.6298, "step": 7905 }, { "epoch": 2.481384126485935, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.4701, "step": 7906 }, { "epoch": 2.4816979873670997, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.3129, "step": 7907 }, { "epoch": 2.482011848248264, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.619, "step": 7908 }, { "epoch": 2.4823257091294284, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.8144, "step": 7909 }, { "epoch": 2.4826395700105928, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.5604, "step": 7910 }, { "epoch": 2.482953430891757, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.9081, "step": 7911 }, { "epoch": 2.4832672917729215, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.8851, "step": 7912 }, { "epoch": 2.483581152654086, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.7626, "step": 7913 }, { "epoch": 2.4838950135352507, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.7049, "step": 7914 }, { "epoch": 2.484208874416415, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 2.1731, "step": 7915 }, { "epoch": 2.4845227352975794, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.3674, "step": 7916 }, { "epoch": 2.4848365961787438, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 1.708, "step": 7917 }, { "epoch": 2.485150457059908, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.2027, "step": 7918 }, { "epoch": 2.4854643179410725, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.5399, "step": 7919 }, { "epoch": 2.485778178822237, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 1.9562, "step": 7920 }, { "epoch": 2.4860920397034016, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.6843, "step": 7921 }, { "epoch": 2.486405900584566, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 2.5556, "step": 7922 }, { "epoch": 2.4867197614657304, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.6263, "step": 7923 }, { "epoch": 2.4870336223468947, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.1233, "step": 7924 }, { "epoch": 2.487347483228059, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2968, "step": 7925 }, { "epoch": 2.4876613441092235, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.2912, "step": 7926 }, { "epoch": 2.487975204990388, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.2559, "step": 7927 }, { "epoch": 2.4882890658715526, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.3162, "step": 7928 }, { "epoch": 2.488602926752717, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.3309, "step": 7929 }, { "epoch": 2.4889167876338814, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3008, "step": 7930 }, { "epoch": 2.4892306485150457, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2109, "step": 7931 }, { "epoch": 2.48954450939621, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.1883, "step": 7932 }, { "epoch": 2.4898583702773744, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2936, "step": 7933 }, { "epoch": 2.490172231158539, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.142, "step": 7934 }, { "epoch": 2.4904860920397036, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.2195, "step": 7935 }, { "epoch": 2.490799952920868, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.3405, "step": 7936 }, { "epoch": 2.4911138138020323, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1507, "step": 7937 }, { "epoch": 2.4914276746831967, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3229, "step": 7938 }, { "epoch": 2.491741535564361, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.188, "step": 7939 }, { "epoch": 2.4920553964455254, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.0513, "step": 7940 }, { "epoch": 2.49236925732669, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.0351, "step": 7941 }, { "epoch": 2.4926831182078546, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.1251, "step": 7942 }, { "epoch": 2.492996979089019, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.161, "step": 7943 }, { "epoch": 2.4933108399701833, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 0.9955, "step": 7944 }, { "epoch": 2.4936247008513477, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.2169, "step": 7945 }, { "epoch": 2.493938561732512, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.2239, "step": 7946 }, { "epoch": 2.4942524226136764, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2049, "step": 7947 }, { "epoch": 2.4945662834948408, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.3993, "step": 7948 }, { "epoch": 2.4948801443760056, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.3348, "step": 7949 }, { "epoch": 2.49519400525717, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5381, "step": 7950 }, { "epoch": 2.4955078661383343, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.3664, "step": 7951 }, { "epoch": 2.4958217270194987, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.3104, "step": 7952 }, { "epoch": 2.496135587900663, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.3256, "step": 7953 }, { "epoch": 2.4964494487818274, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.3649, "step": 7954 }, { "epoch": 2.4967633096629918, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.7772, "step": 7955 }, { "epoch": 2.497077170544156, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3112, "step": 7956 }, { "epoch": 2.4973910314253205, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3217, "step": 7957 }, { "epoch": 2.4977048923064853, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.3897, "step": 7958 }, { "epoch": 2.4980187531876497, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.6571, "step": 7959 }, { "epoch": 2.498332614068814, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.557, "step": 7960 }, { "epoch": 2.4986464749499784, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.8139, "step": 7961 }, { "epoch": 2.4989603358311427, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.8725, "step": 7962 }, { "epoch": 2.499274196712307, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.738, "step": 7963 }, { "epoch": 2.4995880575934715, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 2.0403, "step": 7964 }, { "epoch": 2.4999019184746363, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.8007, "step": 7965 }, { "epoch": 2.5002157793558006, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 1.7287, "step": 7966 }, { "epoch": 2.500529640236965, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 1.4855, "step": 7967 }, { "epoch": 2.5008435011181294, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.5514, "step": 7968 }, { "epoch": 2.5011573619992937, "grad_norm": 1.359375, "learning_rate": 0.0002, "loss": 1.3764, "step": 7969 }, { "epoch": 2.501471222880458, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.3529, "step": 7970 }, { "epoch": 2.5017850837616225, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 1.372, "step": 7971 }, { "epoch": 2.5020989446427873, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.7531, "step": 7972 }, { "epoch": 2.5024128055239516, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.4994, "step": 7973 }, { "epoch": 2.502726666405116, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.3102, "step": 7974 }, { "epoch": 2.5030405272862803, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.3701, "step": 7975 }, { "epoch": 2.5033543881674447, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2651, "step": 7976 }, { "epoch": 2.503668249048609, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.1956, "step": 7977 }, { "epoch": 2.5039821099297734, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.3799, "step": 7978 }, { "epoch": 2.5042959708109382, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1531, "step": 7979 }, { "epoch": 2.5046098316921026, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.1886, "step": 7980 }, { "epoch": 2.504923692573267, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.3328, "step": 7981 }, { "epoch": 2.5052375534544313, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.305, "step": 7982 }, { "epoch": 2.5055514143355957, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.251, "step": 7983 }, { "epoch": 2.50586527521676, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2927, "step": 7984 }, { "epoch": 2.5061791360979244, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2262, "step": 7985 }, { "epoch": 2.5064929969790892, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2741, "step": 7986 }, { "epoch": 2.5068068578602536, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1327, "step": 7987 }, { "epoch": 2.507120718741418, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.3341, "step": 7988 }, { "epoch": 2.5074345796225823, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2381, "step": 7989 }, { "epoch": 2.5077484405037467, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2776, "step": 7990 }, { "epoch": 2.508062301384911, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1254, "step": 7991 }, { "epoch": 2.5083761622660754, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1396, "step": 7992 }, { "epoch": 2.50869002314724, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.1002, "step": 7993 }, { "epoch": 2.5090038840284046, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.1628, "step": 7994 }, { "epoch": 2.509317744909569, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.3025, "step": 7995 }, { "epoch": 2.5096316057907333, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.107, "step": 7996 }, { "epoch": 2.5099454666718977, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.1702, "step": 7997 }, { "epoch": 2.510259327553062, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.212, "step": 7998 }, { "epoch": 2.5105731884342264, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.5089, "step": 7999 }, { "epoch": 2.510887049315391, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.3374, "step": 8000 }, { "epoch": 2.511200910196555, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.3548, "step": 8001 }, { "epoch": 2.51151477107772, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.3372, "step": 8002 }, { "epoch": 2.5118286319588843, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3993, "step": 8003 }, { "epoch": 2.5121424928400486, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.4462, "step": 8004 }, { "epoch": 2.512456353721213, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.4214, "step": 8005 }, { "epoch": 2.5127702146023774, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.4534, "step": 8006 }, { "epoch": 2.513084075483542, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.4045, "step": 8007 }, { "epoch": 2.513397936364706, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4717, "step": 8008 }, { "epoch": 2.513711797245871, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.6533, "step": 8009 }, { "epoch": 2.5140256581270353, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5562, "step": 8010 }, { "epoch": 2.5143395190081996, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 1.9506, "step": 8011 }, { "epoch": 2.514653379889364, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.6297, "step": 8012 }, { "epoch": 2.5149672407705284, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 2.058, "step": 8013 }, { "epoch": 2.515281101651693, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.6586, "step": 8014 }, { "epoch": 2.515594962532857, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.2986, "step": 8015 }, { "epoch": 2.515908823414022, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.9196, "step": 8016 }, { "epoch": 2.5162226842951863, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.5324, "step": 8017 }, { "epoch": 2.5165365451763506, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 1.6906, "step": 8018 }, { "epoch": 2.516850406057515, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.4763, "step": 8019 }, { "epoch": 2.5171642669386793, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.6944, "step": 8020 }, { "epoch": 2.5174781278198437, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.6834, "step": 8021 }, { "epoch": 2.517791988701008, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 2.3512, "step": 8022 }, { "epoch": 2.518105849582173, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.689, "step": 8023 }, { "epoch": 2.5184197104633372, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.2366, "step": 8024 }, { "epoch": 2.5187335713445016, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.2755, "step": 8025 }, { "epoch": 2.519047432225666, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.201, "step": 8026 }, { "epoch": 2.5193612931068303, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.3067, "step": 8027 }, { "epoch": 2.5196751539879947, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2367, "step": 8028 }, { "epoch": 2.519989014869159, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.3592, "step": 8029 }, { "epoch": 2.520302875750324, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3156, "step": 8030 }, { "epoch": 2.520616736631488, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3446, "step": 8031 }, { "epoch": 2.5209305975126526, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1283, "step": 8032 }, { "epoch": 2.521244458393817, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.3188, "step": 8033 }, { "epoch": 2.5215583192749813, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1979, "step": 8034 }, { "epoch": 2.5218721801561457, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2261, "step": 8035 }, { "epoch": 2.52218604103731, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.1235, "step": 8036 }, { "epoch": 2.522499901918475, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.142, "step": 8037 }, { "epoch": 2.522813762799639, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.201, "step": 8038 }, { "epoch": 2.5231276236808036, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.136, "step": 8039 }, { "epoch": 2.523441484561968, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2224, "step": 8040 }, { "epoch": 2.5237553454431323, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.2604, "step": 8041 }, { "epoch": 2.5237553454431323, "eval_loss": 1.8607443571090698, "eval_runtime": 123.55, "eval_samples_per_second": 8.094, "eval_steps_per_second": 8.094, "step": 8041 }, { "epoch": 2.5237553454431323, "mmlu_eval_accuracy": 0.4183529141967123, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4418604651162791, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.6333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.08695652173913043, "mmlu_eval_accuracy_high_school_us_history": 0.6363636363636364, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.26, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.47058823529411764, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.22580645161290322, "mmlu_eval_accuracy_professional_law": 0.3352941176470588, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.2723540784213216, "step": 8041 }, { "epoch": 2.5240692063242967, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2616, "step": 8042 }, { "epoch": 2.524383067205461, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.0516, "step": 8043 }, { "epoch": 2.524696928086626, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.2961, "step": 8044 }, { "epoch": 2.52501078896779, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.2861, "step": 8045 }, { "epoch": 2.5253246498489546, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.0504, "step": 8046 }, { "epoch": 2.525638510730119, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.3087, "step": 8047 }, { "epoch": 2.5259523716112833, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.1018, "step": 8048 }, { "epoch": 2.5262662324924476, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.4948, "step": 8049 }, { "epoch": 2.526580093373612, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.0693, "step": 8050 }, { "epoch": 2.526893954254777, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.274, "step": 8051 }, { "epoch": 2.5272078151359407, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.4565, "step": 8052 }, { "epoch": 2.5275216760171055, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.5235, "step": 8053 }, { "epoch": 2.52783553689827, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5113, "step": 8054 }, { "epoch": 2.5281493977794343, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4896, "step": 8055 }, { "epoch": 2.5284632586605986, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.741, "step": 8056 }, { "epoch": 2.528777119541763, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.6264, "step": 8057 }, { "epoch": 2.529090980422928, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.538, "step": 8058 }, { "epoch": 2.5294048413040917, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.7576, "step": 8059 }, { "epoch": 2.5297187021852565, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4019, "step": 8060 }, { "epoch": 2.530032563066421, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.5436, "step": 8061 }, { "epoch": 2.5303464239475852, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.9869, "step": 8062 }, { "epoch": 2.5306602848287496, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.6505, "step": 8063 }, { "epoch": 2.530974145709914, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7756, "step": 8064 }, { "epoch": 2.5312880065910788, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.7569, "step": 8065 }, { "epoch": 2.5316018674722427, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.69, "step": 8066 }, { "epoch": 2.5319157283534075, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.3747, "step": 8067 }, { "epoch": 2.532229589234572, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.4028, "step": 8068 }, { "epoch": 2.5325434501157362, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.5025, "step": 8069 }, { "epoch": 2.5328573109969006, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 1.563, "step": 8070 }, { "epoch": 2.533171171878065, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.8703, "step": 8071 }, { "epoch": 2.5334850327592293, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 2.1131, "step": 8072 }, { "epoch": 2.5337988936403937, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.5661, "step": 8073 }, { "epoch": 2.5341127545215585, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.3115, "step": 8074 }, { "epoch": 2.534426615402723, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.2665, "step": 8075 }, { "epoch": 2.534740476283887, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.4033, "step": 8076 }, { "epoch": 2.5350543371650516, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2886, "step": 8077 }, { "epoch": 2.535368198046216, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.1756, "step": 8078 }, { "epoch": 2.5356820589273803, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.2742, "step": 8079 }, { "epoch": 2.5359959198085447, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.1791, "step": 8080 }, { "epoch": 2.5363097806897095, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.2411, "step": 8081 }, { "epoch": 2.536623641570874, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.3197, "step": 8082 }, { "epoch": 2.536937502452038, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2508, "step": 8083 }, { "epoch": 2.5372513633332026, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.2557, "step": 8084 }, { "epoch": 2.537565224214367, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2187, "step": 8085 }, { "epoch": 2.5378790850955313, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2556, "step": 8086 }, { "epoch": 2.5381929459766956, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3464, "step": 8087 }, { "epoch": 2.5385068068578605, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.156, "step": 8088 }, { "epoch": 2.538820667739025, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.19, "step": 8089 }, { "epoch": 2.539134528620189, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2837, "step": 8090 }, { "epoch": 2.5394483895013535, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.187, "step": 8091 }, { "epoch": 2.539762250382518, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2087, "step": 8092 }, { "epoch": 2.5400761112636823, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.1529, "step": 8093 }, { "epoch": 2.5403899721448466, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.2803, "step": 8094 }, { "epoch": 2.5407038330260114, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.2249, "step": 8095 }, { "epoch": 2.541017693907176, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.2193, "step": 8096 }, { "epoch": 2.54133155478834, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2398, "step": 8097 }, { "epoch": 2.5416454156695045, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.3342, "step": 8098 }, { "epoch": 2.541959276550669, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.3602, "step": 8099 }, { "epoch": 2.5422731374318333, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.2073, "step": 8100 }, { "epoch": 2.5425869983129976, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.1154, "step": 8101 }, { "epoch": 2.5429008591941624, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.3494, "step": 8102 }, { "epoch": 2.5432147200753263, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.5837, "step": 8103 }, { "epoch": 2.543528580956491, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.5074, "step": 8104 }, { "epoch": 2.5438424418376555, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.4006, "step": 8105 }, { "epoch": 2.54415630271882, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.8382, "step": 8106 }, { "epoch": 2.5444701635999842, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.9058, "step": 8107 }, { "epoch": 2.5447840244811486, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.4466, "step": 8108 }, { "epoch": 2.5450978853623134, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.5384, "step": 8109 }, { "epoch": 2.5454117462434773, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.7193, "step": 8110 }, { "epoch": 2.545725607124642, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.823, "step": 8111 }, { "epoch": 2.5460394680058065, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5359, "step": 8112 }, { "epoch": 2.546353328886971, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 1.5397, "step": 8113 }, { "epoch": 2.546667189768135, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.6799, "step": 8114 }, { "epoch": 2.5469810506492996, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.7347, "step": 8115 }, { "epoch": 2.5472949115304644, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.5677, "step": 8116 }, { "epoch": 2.5476087724116283, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.8451, "step": 8117 }, { "epoch": 2.547922633292793, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.8102, "step": 8118 }, { "epoch": 2.5482364941739575, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.5335, "step": 8119 }, { "epoch": 2.548550355055122, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5471, "step": 8120 }, { "epoch": 2.548864215936286, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.6972, "step": 8121 }, { "epoch": 2.5491780768174506, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 2.1062, "step": 8122 }, { "epoch": 2.549491937698615, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.6585, "step": 8123 }, { "epoch": 2.5498057985797793, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.3591, "step": 8124 }, { "epoch": 2.550119659460944, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.4368, "step": 8125 }, { "epoch": 2.5504335203421085, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3583, "step": 8126 }, { "epoch": 2.550747381223273, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.4122, "step": 8127 }, { "epoch": 2.551061242104437, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3568, "step": 8128 }, { "epoch": 2.5513751029856016, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2018, "step": 8129 }, { "epoch": 2.551688963866766, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2514, "step": 8130 }, { "epoch": 2.5520028247479303, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.1808, "step": 8131 }, { "epoch": 2.552316685629095, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2321, "step": 8132 }, { "epoch": 2.5526305465102594, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1657, "step": 8133 }, { "epoch": 2.552944407391424, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3385, "step": 8134 }, { "epoch": 2.553258268272588, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2294, "step": 8135 }, { "epoch": 2.5535721291537525, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2754, "step": 8136 }, { "epoch": 2.553885990034917, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2386, "step": 8137 }, { "epoch": 2.5541998509160813, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0577, "step": 8138 }, { "epoch": 2.554513711797246, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.2231, "step": 8139 }, { "epoch": 2.5548275726784104, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1979, "step": 8140 }, { "epoch": 2.555141433559575, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.1414, "step": 8141 }, { "epoch": 2.555455294440739, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.1689, "step": 8142 }, { "epoch": 2.5557691553219035, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.1552, "step": 8143 }, { "epoch": 2.556083016203068, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.1334, "step": 8144 }, { "epoch": 2.5563968770842322, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.16, "step": 8145 }, { "epoch": 2.556710737965397, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3493, "step": 8146 }, { "epoch": 2.5570245988465614, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.1908, "step": 8147 }, { "epoch": 2.5573384597277258, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.2483, "step": 8148 }, { "epoch": 2.55765232060889, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.5053, "step": 8149 }, { "epoch": 2.5579661814900545, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.9777, "step": 8150 }, { "epoch": 2.558280042371219, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.3726, "step": 8151 }, { "epoch": 2.5585939032523832, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.4535, "step": 8152 }, { "epoch": 2.558907764133548, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.4632, "step": 8153 }, { "epoch": 2.559221625014712, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.5399, "step": 8154 }, { "epoch": 2.5595354858958768, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.3817, "step": 8155 }, { "epoch": 2.559849346777041, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.6101, "step": 8156 }, { "epoch": 2.5601632076582055, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.5083, "step": 8157 }, { "epoch": 2.56047706853937, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.4321, "step": 8158 }, { "epoch": 2.560790929420534, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.9173, "step": 8159 }, { "epoch": 2.561104790301699, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 1.8915, "step": 8160 }, { "epoch": 2.561418651182863, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.6727, "step": 8161 }, { "epoch": 2.5617325120640277, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 1.7423, "step": 8162 }, { "epoch": 2.562046372945192, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.8554, "step": 8163 }, { "epoch": 2.5623602338263565, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.7873, "step": 8164 }, { "epoch": 2.562674094707521, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.4724, "step": 8165 }, { "epoch": 2.562987955588685, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.6006, "step": 8166 }, { "epoch": 2.56330181646985, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.5194, "step": 8167 }, { "epoch": 2.563615677351014, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.6557, "step": 8168 }, { "epoch": 2.5639295382321787, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.5988, "step": 8169 }, { "epoch": 2.564243399113343, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.6732, "step": 8170 }, { "epoch": 2.5645572599945075, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 1.971, "step": 8171 }, { "epoch": 2.564871120875672, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.534, "step": 8172 }, { "epoch": 2.565184981756836, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.612, "step": 8173 }, { "epoch": 2.5654988426380005, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.3118, "step": 8174 }, { "epoch": 2.565812703519165, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.186, "step": 8175 }, { "epoch": 2.5661265644003297, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.2368, "step": 8176 }, { "epoch": 2.566440425281494, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.2674, "step": 8177 }, { "epoch": 2.5667542861626584, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.1439, "step": 8178 }, { "epoch": 2.567068147043823, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.274, "step": 8179 }, { "epoch": 2.567382007924987, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1665, "step": 8180 }, { "epoch": 2.5676958688061515, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3386, "step": 8181 }, { "epoch": 2.568009729687316, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1246, "step": 8182 }, { "epoch": 2.5683235905684807, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.2216, "step": 8183 }, { "epoch": 2.568637451449645, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.3027, "step": 8184 }, { "epoch": 2.5689513123308094, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.2367, "step": 8185 }, { "epoch": 2.569265173211974, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3236, "step": 8186 }, { "epoch": 2.569579034093138, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2682, "step": 8187 }, { "epoch": 2.5698928949743025, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2099, "step": 8188 }, { "epoch": 2.570206755855467, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.1102, "step": 8189 }, { "epoch": 2.5705206167366317, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.3295, "step": 8190 }, { "epoch": 2.570834477617796, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.2174, "step": 8191 }, { "epoch": 2.5711483384989604, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.3828, "step": 8192 }, { "epoch": 2.5714621993801248, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.1708, "step": 8193 }, { "epoch": 2.571776060261289, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.0845, "step": 8194 }, { "epoch": 2.5720899211424535, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.2666, "step": 8195 }, { "epoch": 2.572403782023618, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.0794, "step": 8196 }, { "epoch": 2.5727176429047827, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.0743, "step": 8197 }, { "epoch": 2.573031503785947, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.1639, "step": 8198 }, { "epoch": 2.5733453646671114, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.3559, "step": 8199 }, { "epoch": 2.5736592255482758, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.1107, "step": 8200 }, { "epoch": 2.57397308642944, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.4383, "step": 8201 }, { "epoch": 2.5742869473106045, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4989, "step": 8202 }, { "epoch": 2.574600808191769, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.4513, "step": 8203 }, { "epoch": 2.5749146690729336, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.3357, "step": 8204 }, { "epoch": 2.575228529954098, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5024, "step": 8205 }, { "epoch": 2.5755423908352624, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.3976, "step": 8206 }, { "epoch": 2.5758562517164267, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3133, "step": 8207 }, { "epoch": 2.576170112597591, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.3584, "step": 8208 }, { "epoch": 2.5764839734787555, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.5127, "step": 8209 }, { "epoch": 2.57679783435992, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.5952, "step": 8210 }, { "epoch": 2.5771116952410846, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.488, "step": 8211 }, { "epoch": 2.5774255561222486, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.472, "step": 8212 }, { "epoch": 2.5777394170034134, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.7622, "step": 8213 }, { "epoch": 2.5780532778845777, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.877, "step": 8214 }, { "epoch": 2.578367138765742, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.4925, "step": 8215 }, { "epoch": 2.5786809996469064, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.4816, "step": 8216 }, { "epoch": 2.578994860528071, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.6054, "step": 8217 }, { "epoch": 2.5793087214092356, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.3514, "step": 8218 }, { "epoch": 2.5796225822903995, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.2207, "step": 8219 }, { "epoch": 2.5799364431715643, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5068, "step": 8220 }, { "epoch": 2.5802503040527287, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.7892, "step": 8221 }, { "epoch": 2.580564164933893, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 2.6314, "step": 8222 }, { "epoch": 2.5808780258150574, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.6758, "step": 8223 }, { "epoch": 2.581191886696222, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.4171, "step": 8224 }, { "epoch": 2.5815057475773866, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.2907, "step": 8225 }, { "epoch": 2.5818196084585505, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.2336, "step": 8226 }, { "epoch": 2.5821334693397153, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2096, "step": 8227 }, { "epoch": 2.5824473302208797, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.2043, "step": 8228 }, { "epoch": 2.5824473302208797, "eval_loss": 1.9074493646621704, "eval_runtime": 123.3494, "eval_samples_per_second": 8.107, "eval_steps_per_second": 8.107, "step": 8228 }, { "epoch": 2.5824473302208797, "mmlu_eval_accuracy": 0.41454449161637824, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.4090909090909091, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.5116279069767442, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.4230769230769231, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.65, "mmlu_eval_accuracy_high_school_statistics": 0.13043478260869565, "mmlu_eval_accuracy_high_school_us_history": 0.6818181818181818, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.6086956521739131, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5, "mmlu_eval_accuracy_moral_disputes": 0.5, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.5757575757575758, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.4492753623188406, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.1899194760426908, "step": 8228 }, { "epoch": 2.582761191102044, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2567, "step": 8229 }, { "epoch": 2.5830750519832084, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2747, "step": 8230 }, { "epoch": 2.5833889128643728, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.3675, "step": 8231 }, { "epoch": 2.583702773745537, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.2292, "step": 8232 }, { "epoch": 2.5840166346267015, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2752, "step": 8233 }, { "epoch": 2.5843304955078663, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.16, "step": 8234 }, { "epoch": 2.5846443563890307, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.2344, "step": 8235 }, { "epoch": 2.584958217270195, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.244, "step": 8236 }, { "epoch": 2.5852720781513594, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1552, "step": 8237 }, { "epoch": 2.5855859390325238, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.1759, "step": 8238 }, { "epoch": 2.585899799913688, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.123, "step": 8239 }, { "epoch": 2.5862136607948525, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1789, "step": 8240 }, { "epoch": 2.5865275216760173, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.2807, "step": 8241 }, { "epoch": 2.5868413825571817, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.3216, "step": 8242 }, { "epoch": 2.587155243438346, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.1823, "step": 8243 }, { "epoch": 2.5874691043195104, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.0445, "step": 8244 }, { "epoch": 2.5877829652006747, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.1879, "step": 8245 }, { "epoch": 2.588096826081839, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.2182, "step": 8246 }, { "epoch": 2.5884106869630035, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1286, "step": 8247 }, { "epoch": 2.5887245478441683, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.2157, "step": 8248 }, { "epoch": 2.5890384087253326, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.1246, "step": 8249 }, { "epoch": 2.589352269606497, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.3203, "step": 8250 }, { "epoch": 2.5896661304876614, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2874, "step": 8251 }, { "epoch": 2.5899799913688257, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3615, "step": 8252 }, { "epoch": 2.59029385224999, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.2743, "step": 8253 }, { "epoch": 2.5906077131311545, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.5896, "step": 8254 }, { "epoch": 2.5909215740123193, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.5021, "step": 8255 }, { "epoch": 2.5912354348934836, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.5988, "step": 8256 }, { "epoch": 2.591549295774648, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.7784, "step": 8257 }, { "epoch": 2.5918631566558124, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 1.8666, "step": 8258 }, { "epoch": 2.5921770175369767, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5718, "step": 8259 }, { "epoch": 2.592490878418141, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.5626, "step": 8260 }, { "epoch": 2.5928047392993054, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.8873, "step": 8261 }, { "epoch": 2.5931186001804702, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.7911, "step": 8262 }, { "epoch": 2.593432461061634, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 2.0731, "step": 8263 }, { "epoch": 2.593746321942799, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 2.0167, "step": 8264 }, { "epoch": 2.5940601828239633, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.5655, "step": 8265 }, { "epoch": 2.5943740437051277, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.531, "step": 8266 }, { "epoch": 2.594687904586292, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.603, "step": 8267 }, { "epoch": 2.5950017654674564, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.8439, "step": 8268 }, { "epoch": 2.5953156263486212, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.5067, "step": 8269 }, { "epoch": 2.595629487229785, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.4847, "step": 8270 }, { "epoch": 2.59594334811095, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 1.6979, "step": 8271 }, { "epoch": 2.5962572089921143, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 2.0087, "step": 8272 }, { "epoch": 2.5965710698732787, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.5709, "step": 8273 }, { "epoch": 2.596884930754443, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.2485, "step": 8274 }, { "epoch": 2.5971987916356074, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.2437, "step": 8275 }, { "epoch": 2.597512652516772, "grad_norm": 0.2001953125, "learning_rate": 0.0002, "loss": 1.264, "step": 8276 }, { "epoch": 2.597826513397936, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3852, "step": 8277 }, { "epoch": 2.598140374279101, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.2733, "step": 8278 }, { "epoch": 2.5984542351602653, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3565, "step": 8279 }, { "epoch": 2.5987680960414297, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.3105, "step": 8280 }, { "epoch": 2.599081956922594, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.4101, "step": 8281 }, { "epoch": 2.5993958178037584, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3282, "step": 8282 }, { "epoch": 2.5997096786849228, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.2049, "step": 8283 }, { "epoch": 2.600023539566087, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.202, "step": 8284 }, { "epoch": 2.600337400447252, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.3742, "step": 8285 }, { "epoch": 2.6006512613284163, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.2715, "step": 8286 }, { "epoch": 2.6009651222095806, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2058, "step": 8287 }, { "epoch": 2.601278983090745, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2405, "step": 8288 }, { "epoch": 2.6015928439719094, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1919, "step": 8289 }, { "epoch": 2.6019067048530737, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.0828, "step": 8290 }, { "epoch": 2.602220565734238, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.1654, "step": 8291 }, { "epoch": 2.602534426615403, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2461, "step": 8292 }, { "epoch": 2.6028482874965673, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1752, "step": 8293 }, { "epoch": 2.6031621483777316, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.145, "step": 8294 }, { "epoch": 2.603476009258896, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.1916, "step": 8295 }, { "epoch": 2.6037898701400604, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3542, "step": 8296 }, { "epoch": 2.6041037310212247, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.3022, "step": 8297 }, { "epoch": 2.604417591902389, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.3625, "step": 8298 }, { "epoch": 2.604731452783554, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.06, "step": 8299 }, { "epoch": 2.6050453136647183, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.6145, "step": 8300 }, { "epoch": 2.6053591745458826, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.2478, "step": 8301 }, { "epoch": 2.605673035427047, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3985, "step": 8302 }, { "epoch": 2.6059868963082113, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.4626, "step": 8303 }, { "epoch": 2.6063007571893757, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.2528, "step": 8304 }, { "epoch": 2.60661461807054, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.5425, "step": 8305 }, { "epoch": 2.606928478951705, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.4051, "step": 8306 }, { "epoch": 2.6072423398328692, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.2567, "step": 8307 }, { "epoch": 2.6075562007140336, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.4044, "step": 8308 }, { "epoch": 2.607870061595198, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.563, "step": 8309 }, { "epoch": 2.6081839224763623, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.6939, "step": 8310 }, { "epoch": 2.6084977833575267, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.5868, "step": 8311 }, { "epoch": 2.608811644238691, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.9556, "step": 8312 }, { "epoch": 2.609125505119856, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 2.0743, "step": 8313 }, { "epoch": 2.6094393660010198, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.6938, "step": 8314 }, { "epoch": 2.6097532268821846, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.8604, "step": 8315 }, { "epoch": 2.610067087763349, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 2.0713, "step": 8316 }, { "epoch": 2.6103809486445133, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.5262, "step": 8317 }, { "epoch": 2.6106948095256777, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 2.0565, "step": 8318 }, { "epoch": 2.611008670406842, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 1.8823, "step": 8319 }, { "epoch": 2.611322531288007, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 1.5251, "step": 8320 }, { "epoch": 2.6116363921691708, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 1.4625, "step": 8321 }, { "epoch": 2.6119502530503356, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 2.7737, "step": 8322 }, { "epoch": 2.6122641139315, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.6456, "step": 8323 }, { "epoch": 2.6125779748126643, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.1549, "step": 8324 }, { "epoch": 2.6128918356938287, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.3724, "step": 8325 }, { "epoch": 2.613205696574993, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.2699, "step": 8326 }, { "epoch": 2.613519557456158, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2312, "step": 8327 }, { "epoch": 2.6138334183373217, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.1906, "step": 8328 }, { "epoch": 2.6141472792184866, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.2342, "step": 8329 }, { "epoch": 2.614461140099651, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.1849, "step": 8330 }, { "epoch": 2.6147750009808153, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2706, "step": 8331 }, { "epoch": 2.6150888618619796, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.3209, "step": 8332 }, { "epoch": 2.615402722743144, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3661, "step": 8333 }, { "epoch": 2.6157165836243084, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.3045, "step": 8334 }, { "epoch": 2.6160304445054727, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.3723, "step": 8335 }, { "epoch": 2.6163443053866375, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2643, "step": 8336 }, { "epoch": 2.616658166267802, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0759, "step": 8337 }, { "epoch": 2.6169720271489663, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2399, "step": 8338 }, { "epoch": 2.6172858880301306, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2641, "step": 8339 }, { "epoch": 2.617599748911295, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1964, "step": 8340 }, { "epoch": 2.6179136097924594, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.0877, "step": 8341 }, { "epoch": 2.6182274706736237, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.9945, "step": 8342 }, { "epoch": 2.6185413315547885, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.0591, "step": 8343 }, { "epoch": 2.618855192435953, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.2723, "step": 8344 }, { "epoch": 2.6191690533171172, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.0151, "step": 8345 }, { "epoch": 2.6194829141982816, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.1581, "step": 8346 }, { "epoch": 2.619796775079446, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.2291, "step": 8347 }, { "epoch": 2.6201106359606103, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2839, "step": 8348 }, { "epoch": 2.6204244968417747, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.3063, "step": 8349 }, { "epoch": 2.6207383577229395, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.3656, "step": 8350 }, { "epoch": 2.621052218604104, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3309, "step": 8351 }, { "epoch": 2.6213660794852682, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.3024, "step": 8352 }, { "epoch": 2.6216799403664326, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.1654, "step": 8353 }, { "epoch": 2.621993801247597, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.3907, "step": 8354 }, { "epoch": 2.6223076621287613, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.5283, "step": 8355 }, { "epoch": 2.6226215230099257, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.2897, "step": 8356 }, { "epoch": 2.6229353838910905, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.6339, "step": 8357 }, { "epoch": 2.623249244772255, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.5963, "step": 8358 }, { "epoch": 2.623563105653419, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.7894, "step": 8359 }, { "epoch": 2.6238769665345836, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.7884, "step": 8360 }, { "epoch": 2.624190827415748, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.5161, "step": 8361 }, { "epoch": 2.6245046882969123, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.5421, "step": 8362 }, { "epoch": 2.6248185491780767, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.5299, "step": 8363 }, { "epoch": 2.6251324100592415, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.8635, "step": 8364 }, { "epoch": 2.6254462709404054, "grad_norm": 1.46875, "learning_rate": 0.0002, "loss": 1.841, "step": 8365 }, { "epoch": 2.62576013182157, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.7608, "step": 8366 }, { "epoch": 2.6260739927027346, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.3834, "step": 8367 }, { "epoch": 2.626387853583899, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4079, "step": 8368 }, { "epoch": 2.6267017144650633, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.2541, "step": 8369 }, { "epoch": 2.6270155753462276, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.3591, "step": 8370 }, { "epoch": 2.6273294362273925, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.4543, "step": 8371 }, { "epoch": 2.6276432971085564, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 2.1663, "step": 8372 }, { "epoch": 2.627957157989721, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.6518, "step": 8373 }, { "epoch": 2.6282710188708855, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.3158, "step": 8374 }, { "epoch": 2.62858487975205, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2642, "step": 8375 }, { "epoch": 2.6288987406332143, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.1811, "step": 8376 }, { "epoch": 2.6292126015143786, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.2948, "step": 8377 }, { "epoch": 2.6295264623955434, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.2624, "step": 8378 }, { "epoch": 2.6298403232767074, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.2263, "step": 8379 }, { "epoch": 2.630154184157872, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.3232, "step": 8380 }, { "epoch": 2.6304680450390365, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.1449, "step": 8381 }, { "epoch": 2.630781905920201, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.3683, "step": 8382 }, { "epoch": 2.6310957668013653, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.1488, "step": 8383 }, { "epoch": 2.6314096276825296, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1882, "step": 8384 }, { "epoch": 2.631723488563694, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.1748, "step": 8385 }, { "epoch": 2.6320373494448583, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2578, "step": 8386 }, { "epoch": 2.632351210326023, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.2396, "step": 8387 }, { "epoch": 2.6326650712071875, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1114, "step": 8388 }, { "epoch": 2.632978932088352, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.2044, "step": 8389 }, { "epoch": 2.6332927929695162, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.3455, "step": 8390 }, { "epoch": 2.6336066538506806, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1319, "step": 8391 }, { "epoch": 2.633920514731845, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.1566, "step": 8392 }, { "epoch": 2.6342343756130093, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.156, "step": 8393 }, { "epoch": 2.634548236494174, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.0712, "step": 8394 }, { "epoch": 2.6348620973753385, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.2226, "step": 8395 }, { "epoch": 2.635175958256503, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2076, "step": 8396 }, { "epoch": 2.6354898191376672, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.2224, "step": 8397 }, { "epoch": 2.6358036800188316, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.3536, "step": 8398 }, { "epoch": 2.636117540899996, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.3716, "step": 8399 }, { "epoch": 2.6364314017811603, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.6506, "step": 8400 }, { "epoch": 2.636745262662325, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.519, "step": 8401 }, { "epoch": 2.6370591235434895, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.5257, "step": 8402 }, { "epoch": 2.637372984424654, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.3942, "step": 8403 }, { "epoch": 2.637686845305818, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.792, "step": 8404 }, { "epoch": 2.6380007061869826, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.801, "step": 8405 }, { "epoch": 2.638314567068147, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 2.1817, "step": 8406 }, { "epoch": 2.6386284279493113, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4631, "step": 8407 }, { "epoch": 2.638942288830476, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.758, "step": 8408 }, { "epoch": 2.6392561497116405, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 1.6361, "step": 8409 }, { "epoch": 2.639570010592805, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.4907, "step": 8410 }, { "epoch": 2.639883871473969, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.9963, "step": 8411 }, { "epoch": 2.6401977323551336, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.8844, "step": 8412 }, { "epoch": 2.640511593236298, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 2.2396, "step": 8413 }, { "epoch": 2.6408254541174623, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.8646, "step": 8414 }, { "epoch": 2.641139314998627, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.4994, "step": 8415 }, { "epoch": 2.641139314998627, "eval_loss": 1.8945120573043823, "eval_runtime": 123.0456, "eval_samples_per_second": 8.127, "eval_steps_per_second": 8.127, "step": 8415 }, { "epoch": 2.641139314998627, "mmlu_eval_accuracy": 0.42608607294045037, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.5172413793103449, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.5, "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.46153846153846156, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.42857142857142855, "mmlu_eval_accuracy_global_facts": 0.5, "mmlu_eval_accuracy_high_school_biology": 0.21875, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.6363636363636364, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.46511627906976744, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5581395348837209, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.38235294117647056, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2856478922550854, "step": 8415 }, { "epoch": 2.6414531758797914, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4189, "step": 8416 }, { "epoch": 2.641767036760956, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4305, "step": 8417 }, { "epoch": 2.64208089764212, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.4127, "step": 8418 }, { "epoch": 2.6423947585232845, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 1.9099, "step": 8419 }, { "epoch": 2.642708619404449, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.4152, "step": 8420 }, { "epoch": 2.6430224802856133, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.4639, "step": 8421 }, { "epoch": 2.643336341166778, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 2.7808, "step": 8422 }, { "epoch": 2.643650202047942, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.7995, "step": 8423 }, { "epoch": 2.643964062929107, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.2992, "step": 8424 }, { "epoch": 2.644277923810271, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.2542, "step": 8425 }, { "epoch": 2.6445917846914355, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.4125, "step": 8426 }, { "epoch": 2.6449056455726, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.337, "step": 8427 }, { "epoch": 2.6452195064537642, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.2766, "step": 8428 }, { "epoch": 2.645533367334929, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3633, "step": 8429 }, { "epoch": 2.645847228216093, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.1436, "step": 8430 }, { "epoch": 2.646161089097258, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.3849, "step": 8431 }, { "epoch": 2.646474949978422, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3329, "step": 8432 }, { "epoch": 2.6467888108595865, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.2212, "step": 8433 }, { "epoch": 2.647102671740751, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2438, "step": 8434 }, { "epoch": 2.6474165326219152, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2411, "step": 8435 }, { "epoch": 2.6477303935030796, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2334, "step": 8436 }, { "epoch": 2.648044254384244, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.3269, "step": 8437 }, { "epoch": 2.6483581152654088, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1645, "step": 8438 }, { "epoch": 2.648671976146573, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.2039, "step": 8439 }, { "epoch": 2.6489858370277375, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.2391, "step": 8440 }, { "epoch": 2.649299697908902, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.0828, "step": 8441 }, { "epoch": 2.649613558790066, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.1592, "step": 8442 }, { "epoch": 2.6499274196712306, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.0751, "step": 8443 }, { "epoch": 2.650241280552395, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.3597, "step": 8444 }, { "epoch": 2.6505551414335597, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.1523, "step": 8445 }, { "epoch": 2.650869002314724, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.2061, "step": 8446 }, { "epoch": 2.6511828631958885, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.1965, "step": 8447 }, { "epoch": 2.651496724077053, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.1895, "step": 8448 }, { "epoch": 2.651810584958217, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.2136, "step": 8449 }, { "epoch": 2.6521244458393816, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.3367, "step": 8450 }, { "epoch": 2.652438306720546, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3442, "step": 8451 }, { "epoch": 2.6527521676017107, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.2243, "step": 8452 }, { "epoch": 2.653066028482875, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.3153, "step": 8453 }, { "epoch": 2.6533798893640395, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.4304, "step": 8454 }, { "epoch": 2.653693750245204, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5442, "step": 8455 }, { "epoch": 2.654007611126368, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.6766, "step": 8456 }, { "epoch": 2.6543214720075325, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.6185, "step": 8457 }, { "epoch": 2.654635332888697, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.6418, "step": 8458 }, { "epoch": 2.6549491937698617, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.632, "step": 8459 }, { "epoch": 2.655263054651026, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.765, "step": 8460 }, { "epoch": 2.6555769155321904, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.4777, "step": 8461 }, { "epoch": 2.655890776413355, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.6658, "step": 8462 }, { "epoch": 2.656204637294519, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.8833, "step": 8463 }, { "epoch": 2.6565184981756835, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.9558, "step": 8464 }, { "epoch": 2.656832359056848, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.868, "step": 8465 }, { "epoch": 2.6571462199380127, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 1.7597, "step": 8466 }, { "epoch": 2.657460080819177, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.4994, "step": 8467 }, { "epoch": 2.6577739417003414, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 1.6752, "step": 8468 }, { "epoch": 2.658087802581506, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.7291, "step": 8469 }, { "epoch": 2.65840166346267, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 1.6224, "step": 8470 }, { "epoch": 2.6587155243438345, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 1.856, "step": 8471 }, { "epoch": 2.659029385224999, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 2.2316, "step": 8472 }, { "epoch": 2.6593432461061637, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.7545, "step": 8473 }, { "epoch": 2.6596571069873276, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.257, "step": 8474 }, { "epoch": 2.6599709678684924, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.1942, "step": 8475 }, { "epoch": 2.6602848287496568, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.2967, "step": 8476 }, { "epoch": 2.660598689630821, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.1637, "step": 8477 }, { "epoch": 2.6609125505119855, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.4163, "step": 8478 }, { "epoch": 2.66122641139315, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.2409, "step": 8479 }, { "epoch": 2.6615402722743147, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3627, "step": 8480 }, { "epoch": 2.6618541331554786, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.3037, "step": 8481 }, { "epoch": 2.6621679940366434, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.1783, "step": 8482 }, { "epoch": 2.6624818549178078, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3501, "step": 8483 }, { "epoch": 2.662795715798972, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.2841, "step": 8484 }, { "epoch": 2.6631095766801365, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1642, "step": 8485 }, { "epoch": 2.663423437561301, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1707, "step": 8486 }, { "epoch": 2.6637372984424657, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2257, "step": 8487 }, { "epoch": 2.6640511593236296, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2401, "step": 8488 }, { "epoch": 2.6643650202047944, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1447, "step": 8489 }, { "epoch": 2.6646788810859587, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.1235, "step": 8490 }, { "epoch": 2.664992741967123, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.1939, "step": 8491 }, { "epoch": 2.6653066028482875, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.1468, "step": 8492 }, { "epoch": 2.665620463729452, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.0362, "step": 8493 }, { "epoch": 2.665934324610616, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2149, "step": 8494 }, { "epoch": 2.6662481854917806, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.1619, "step": 8495 }, { "epoch": 2.6665620463729454, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.1317, "step": 8496 }, { "epoch": 2.6668759072541097, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.0797, "step": 8497 }, { "epoch": 2.667189768135274, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3879, "step": 8498 }, { "epoch": 2.6675036290164384, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.2889, "step": 8499 }, { "epoch": 2.667817489897603, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.3283, "step": 8500 }, { "epoch": 2.668131350778767, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.3981, "step": 8501 }, { "epoch": 2.6684452116599315, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.3878, "step": 8502 }, { "epoch": 2.6687590725410963, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.5925, "step": 8503 }, { "epoch": 2.6690729334222607, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.44, "step": 8504 }, { "epoch": 2.669386794303425, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.6258, "step": 8505 }, { "epoch": 2.6697006551845894, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4896, "step": 8506 }, { "epoch": 2.670014516065754, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.756, "step": 8507 }, { "epoch": 2.670328376946918, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.9529, "step": 8508 }, { "epoch": 2.6706422378280825, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.3928, "step": 8509 }, { "epoch": 2.6709560987092473, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.7558, "step": 8510 }, { "epoch": 2.6712699595904117, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.9714, "step": 8511 }, { "epoch": 2.671583820471576, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.3451, "step": 8512 }, { "epoch": 2.6718976813527404, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.7067, "step": 8513 }, { "epoch": 2.672211542233905, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.6845, "step": 8514 }, { "epoch": 2.672525403115069, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.6022, "step": 8515 }, { "epoch": 2.6728392639962335, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.9001, "step": 8516 }, { "epoch": 2.6731531248773983, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.679, "step": 8517 }, { "epoch": 2.6734669857585627, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.3794, "step": 8518 }, { "epoch": 2.673780846639727, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 1.6959, "step": 8519 }, { "epoch": 2.6740947075208914, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 1.7331, "step": 8520 }, { "epoch": 2.6744085684020558, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.7728, "step": 8521 }, { "epoch": 2.67472242928322, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 2.2475, "step": 8522 }, { "epoch": 2.6750362901643845, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.5638, "step": 8523 }, { "epoch": 2.6753501510455493, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.3461, "step": 8524 }, { "epoch": 2.675664011926713, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.4559, "step": 8525 }, { "epoch": 2.675977872807878, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.3845, "step": 8526 }, { "epoch": 2.6762917336890424, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3858, "step": 8527 }, { "epoch": 2.6766055945702067, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.4012, "step": 8528 }, { "epoch": 2.676919455451371, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.4453, "step": 8529 }, { "epoch": 2.6772333163325355, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.3059, "step": 8530 }, { "epoch": 2.6775471772137003, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2132, "step": 8531 }, { "epoch": 2.677861038094864, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.1632, "step": 8532 }, { "epoch": 2.678174898976029, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2249, "step": 8533 }, { "epoch": 2.6784887598571934, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2381, "step": 8534 }, { "epoch": 2.6788026207383577, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.151, "step": 8535 }, { "epoch": 2.679116481619522, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1198, "step": 8536 }, { "epoch": 2.6794303425006865, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.4175, "step": 8537 }, { "epoch": 2.6797442033818513, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.1252, "step": 8538 }, { "epoch": 2.680058064263015, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.3283, "step": 8539 }, { "epoch": 2.68037192514418, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.1368, "step": 8540 }, { "epoch": 2.6806857860253444, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.1963, "step": 8541 }, { "epoch": 2.6809996469065087, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2002, "step": 8542 }, { "epoch": 2.681313507787673, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.1619, "step": 8543 }, { "epoch": 2.6816273686688374, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.2311, "step": 8544 }, { "epoch": 2.681941229550002, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.2419, "step": 8545 }, { "epoch": 2.682255090431166, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.0106, "step": 8546 }, { "epoch": 2.682568951312331, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.2238, "step": 8547 }, { "epoch": 2.6828828121934953, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.2007, "step": 8548 }, { "epoch": 2.6831966730746597, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.1026, "step": 8549 }, { "epoch": 2.683510533955824, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.3844, "step": 8550 }, { "epoch": 2.6838243948369884, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.2921, "step": 8551 }, { "epoch": 2.684138255718153, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.3349, "step": 8552 }, { "epoch": 2.684452116599317, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.5137, "step": 8553 }, { "epoch": 2.684765977480482, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.6831, "step": 8554 }, { "epoch": 2.6850798383616463, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 1.8571, "step": 8555 }, { "epoch": 2.6853936992428107, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.4632, "step": 8556 }, { "epoch": 2.685707560123975, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 1.7118, "step": 8557 }, { "epoch": 2.6860214210051394, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5354, "step": 8558 }, { "epoch": 2.6863352818863038, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.4313, "step": 8559 }, { "epoch": 2.686649142767468, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.7895, "step": 8560 }, { "epoch": 2.686963003648633, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.5443, "step": 8561 }, { "epoch": 2.6872768645297973, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.5314, "step": 8562 }, { "epoch": 2.6875907254109617, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 2.2068, "step": 8563 }, { "epoch": 2.687904586292126, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.7831, "step": 8564 }, { "epoch": 2.6882184471732904, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.4933, "step": 8565 }, { "epoch": 2.6885323080544548, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.6909, "step": 8566 }, { "epoch": 2.688846168935619, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.9929, "step": 8567 }, { "epoch": 2.689160029816784, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 1.6094, "step": 8568 }, { "epoch": 2.6894738906979483, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.3614, "step": 8569 }, { "epoch": 2.6897877515791127, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.5155, "step": 8570 }, { "epoch": 2.690101612460277, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.6672, "step": 8571 }, { "epoch": 2.6904154733414414, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 2.2199, "step": 8572 }, { "epoch": 2.6907293342226057, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.4084, "step": 8573 }, { "epoch": 2.69104319510377, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3214, "step": 8574 }, { "epoch": 2.691357055984935, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.2291, "step": 8575 }, { "epoch": 2.691670916866099, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.3664, "step": 8576 }, { "epoch": 2.6919847777472636, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2657, "step": 8577 }, { "epoch": 2.692298638628428, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2366, "step": 8578 }, { "epoch": 2.6926124995095924, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.1635, "step": 8579 }, { "epoch": 2.6929263603907567, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3039, "step": 8580 }, { "epoch": 2.693240221271921, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.4299, "step": 8581 }, { "epoch": 2.693554082153086, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.2262, "step": 8582 }, { "epoch": 2.69386794303425, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1847, "step": 8583 }, { "epoch": 2.6941818039154146, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3661, "step": 8584 }, { "epoch": 2.694495664796579, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2268, "step": 8585 }, { "epoch": 2.6948095256777433, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.2002, "step": 8586 }, { "epoch": 2.6951233865589077, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3211, "step": 8587 }, { "epoch": 2.695437247440072, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2702, "step": 8588 }, { "epoch": 2.695751108321237, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1281, "step": 8589 }, { "epoch": 2.696064969202401, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1183, "step": 8590 }, { "epoch": 2.6963788300835656, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2107, "step": 8591 }, { "epoch": 2.69669269096473, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.2434, "step": 8592 }, { "epoch": 2.6970065518458943, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.1655, "step": 8593 }, { "epoch": 2.6973204127270587, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.0698, "step": 8594 }, { "epoch": 2.697634273608223, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.1282, "step": 8595 }, { "epoch": 2.6979481344893874, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2471, "step": 8596 }, { "epoch": 2.698261995370552, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.2097, "step": 8597 }, { "epoch": 2.6985758562517166, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.2414, "step": 8598 }, { "epoch": 2.698889717132881, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 1.499, "step": 8599 }, { "epoch": 2.6992035780140453, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.347, "step": 8600 }, { "epoch": 2.6995174388952097, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.6096, "step": 8601 }, { "epoch": 2.699831299776374, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.3154, "step": 8602 }, { "epoch": 2.699831299776374, "eval_loss": 1.854783058166504, "eval_runtime": 123.9272, "eval_samples_per_second": 8.069, "eval_steps_per_second": 8.069, "step": 8602 }, { "epoch": 2.699831299776374, "mmlu_eval_accuracy": 0.41937990182100365, "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, "mmlu_eval_accuracy_anatomy": 0.6428571428571429, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.5, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.46153846153846156, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.1951219512195122, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.6, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.5348837209302325, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.6833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5, "mmlu_eval_accuracy_high_school_world_history": 0.5, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.6923076923076923, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.09090909090909091, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273, "mmlu_eval_accuracy_miscellaneous": 0.5697674418604651, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.26, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.2714938804225, "step": 8602 }, { "epoch": 2.7001451606575384, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.3572, "step": 8603 }, { "epoch": 2.7004590215387028, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.5309, "step": 8604 }, { "epoch": 2.7007728824198676, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.5098, "step": 8605 }, { "epoch": 2.701086743301032, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4334, "step": 8606 }, { "epoch": 2.7014006041821963, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.9862, "step": 8607 }, { "epoch": 2.7017144650633607, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5239, "step": 8608 }, { "epoch": 2.702028325944525, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.6678, "step": 8609 }, { "epoch": 2.7023421868256894, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5955, "step": 8610 }, { "epoch": 2.7026560477068537, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5368, "step": 8611 }, { "epoch": 2.7029699085880186, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.7202, "step": 8612 }, { "epoch": 2.703283769469183, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.43, "step": 8613 }, { "epoch": 2.7035976303503473, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.7292, "step": 8614 }, { "epoch": 2.7039114912315116, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 1.8022, "step": 8615 }, { "epoch": 2.704225352112676, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.688, "step": 8616 }, { "epoch": 2.7045392129938404, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.681, "step": 8617 }, { "epoch": 2.7048530738750047, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.8319, "step": 8618 }, { "epoch": 2.7051669347561695, "grad_norm": 1.40625, "learning_rate": 0.0002, "loss": 1.6245, "step": 8619 }, { "epoch": 2.705480795637334, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.5478, "step": 8620 }, { "epoch": 2.7057946565184983, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.5435, "step": 8621 }, { "epoch": 2.7061085173996626, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 2.2143, "step": 8622 }, { "epoch": 2.706422378280827, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.8024, "step": 8623 }, { "epoch": 2.7067362391619914, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.1995, "step": 8624 }, { "epoch": 2.7070501000431557, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.2541, "step": 8625 }, { "epoch": 2.7073639609243205, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.3442, "step": 8626 }, { "epoch": 2.7076778218054844, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3143, "step": 8627 }, { "epoch": 2.7079916826866492, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.2439, "step": 8628 }, { "epoch": 2.7083055435678136, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1414, "step": 8629 }, { "epoch": 2.708619404448978, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.0883, "step": 8630 }, { "epoch": 2.7089332653301423, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1546, "step": 8631 }, { "epoch": 2.7092471262113067, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.1885, "step": 8632 }, { "epoch": 2.7095609870924715, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.2473, "step": 8633 }, { "epoch": 2.7098748479736354, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.4387, "step": 8634 }, { "epoch": 2.7101887088548002, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.146, "step": 8635 }, { "epoch": 2.7105025697359646, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.2243, "step": 8636 }, { "epoch": 2.710816430617129, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1487, "step": 8637 }, { "epoch": 2.7111302914982933, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2089, "step": 8638 }, { "epoch": 2.7114441523794577, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2721, "step": 8639 }, { "epoch": 2.7117580132606225, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.2621, "step": 8640 }, { "epoch": 2.7120718741417864, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.2357, "step": 8641 }, { "epoch": 2.712385735022951, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.1562, "step": 8642 }, { "epoch": 2.7126995959041156, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.2351, "step": 8643 }, { "epoch": 2.71301345678528, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2223, "step": 8644 }, { "epoch": 2.7133273176664443, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.31, "step": 8645 }, { "epoch": 2.7136411785476087, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.1058, "step": 8646 }, { "epoch": 2.713955039428773, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.3404, "step": 8647 }, { "epoch": 2.7142689003099374, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.2708, "step": 8648 }, { "epoch": 2.714582761191102, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.177, "step": 8649 }, { "epoch": 2.7148966220722666, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.283, "step": 8650 }, { "epoch": 2.715210482953431, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.598, "step": 8651 }, { "epoch": 2.7155243438345953, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.2474, "step": 8652 }, { "epoch": 2.7158382047157597, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5265, "step": 8653 }, { "epoch": 2.716152065596924, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.4124, "step": 8654 }, { "epoch": 2.7164659264780884, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.3375, "step": 8655 }, { "epoch": 2.716779787359253, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.524, "step": 8656 }, { "epoch": 2.7170936482404175, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.2545, "step": 8657 }, { "epoch": 2.717407509121582, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5326, "step": 8658 }, { "epoch": 2.7177213700027463, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.6215, "step": 8659 }, { "epoch": 2.7180352308839106, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.4816, "step": 8660 }, { "epoch": 2.718349091765075, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.3333, "step": 8661 }, { "epoch": 2.7186629526462394, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.5719, "step": 8662 }, { "epoch": 2.718976813527404, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.8487, "step": 8663 }, { "epoch": 2.7192906744085685, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.7522, "step": 8664 }, { "epoch": 2.719604535289733, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.9769, "step": 8665 }, { "epoch": 2.7199183961708973, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.7196, "step": 8666 }, { "epoch": 2.7202322570520616, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.9253, "step": 8667 }, { "epoch": 2.720546117933226, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 1.5955, "step": 8668 }, { "epoch": 2.7208599788143903, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.5427, "step": 8669 }, { "epoch": 2.721173839695555, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 1.7613, "step": 8670 }, { "epoch": 2.7214877005767195, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.6044, "step": 8671 }, { "epoch": 2.721801561457884, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 2.3301, "step": 8672 }, { "epoch": 2.7221154223390482, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.7773, "step": 8673 }, { "epoch": 2.7224292832202126, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.1072, "step": 8674 }, { "epoch": 2.722743144101377, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.2099, "step": 8675 }, { "epoch": 2.7230570049825413, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.2762, "step": 8676 }, { "epoch": 2.723370865863706, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.1811, "step": 8677 }, { "epoch": 2.7236847267448705, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3018, "step": 8678 }, { "epoch": 2.723998587626035, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3907, "step": 8679 }, { "epoch": 2.7243124485071992, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.3295, "step": 8680 }, { "epoch": 2.7246263093883636, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.1297, "step": 8681 }, { "epoch": 2.724940170269528, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.2774, "step": 8682 }, { "epoch": 2.7252540311506923, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.2902, "step": 8683 }, { "epoch": 2.725567892031857, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1716, "step": 8684 }, { "epoch": 2.725881752913021, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2768, "step": 8685 }, { "epoch": 2.726195613794186, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.2164, "step": 8686 }, { "epoch": 2.72650947467535, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2066, "step": 8687 }, { "epoch": 2.7268233355565146, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1088, "step": 8688 }, { "epoch": 2.727137196437679, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.1148, "step": 8689 }, { "epoch": 2.7274510573188433, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.1722, "step": 8690 }, { "epoch": 2.727764918200008, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.3588, "step": 8691 }, { "epoch": 2.728078779081172, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.146, "step": 8692 }, { "epoch": 2.728392639962337, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.077, "step": 8693 }, { "epoch": 2.728706500843501, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.3593, "step": 8694 }, { "epoch": 2.7290203617246656, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1837, "step": 8695 }, { "epoch": 2.72933422260583, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.3162, "step": 8696 }, { "epoch": 2.7296480834869943, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 1.398, "step": 8697 }, { "epoch": 2.729961944368159, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.9846, "step": 8698 }, { "epoch": 2.730275805249323, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.1597, "step": 8699 }, { "epoch": 2.730589666130488, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2665, "step": 8700 }, { "epoch": 2.730903527011652, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.2117, "step": 8701 }, { "epoch": 2.7312173878928165, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.5063, "step": 8702 }, { "epoch": 2.731531248773981, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.4168, "step": 8703 }, { "epoch": 2.7318451096551453, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 1.8986, "step": 8704 }, { "epoch": 2.7321589705363096, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.4895, "step": 8705 }, { "epoch": 2.732472831417474, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.3423, "step": 8706 }, { "epoch": 2.732786692298639, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.7741, "step": 8707 }, { "epoch": 2.733100553179803, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.3557, "step": 8708 }, { "epoch": 2.7334144140609675, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4249, "step": 8709 }, { "epoch": 2.733728274942132, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.7385, "step": 8710 }, { "epoch": 2.7340421358232962, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 1.7876, "step": 8711 }, { "epoch": 2.7343559967044606, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.7375, "step": 8712 }, { "epoch": 2.734669857585625, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.5032, "step": 8713 }, { "epoch": 2.73498371846679, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5102, "step": 8714 }, { "epoch": 2.735297579347954, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.9412, "step": 8715 }, { "epoch": 2.7356114402291185, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 2.1166, "step": 8716 }, { "epoch": 2.735925301110283, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.4727, "step": 8717 }, { "epoch": 2.7362391619914472, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.4201, "step": 8718 }, { "epoch": 2.7365530228726116, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.4954, "step": 8719 }, { "epoch": 2.736866883753776, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.4639, "step": 8720 }, { "epoch": 2.7371807446349408, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.8064, "step": 8721 }, { "epoch": 2.737494605516105, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 2.2398, "step": 8722 }, { "epoch": 2.7378084663972695, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.4036, "step": 8723 }, { "epoch": 2.738122327278434, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.2911, "step": 8724 }, { "epoch": 2.738436188159598, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3762, "step": 8725 }, { "epoch": 2.7387500490407626, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3825, "step": 8726 }, { "epoch": 2.739063909921927, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.2075, "step": 8727 }, { "epoch": 2.7393777708030917, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.2913, "step": 8728 }, { "epoch": 2.739691631684256, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2767, "step": 8729 }, { "epoch": 2.7400054925654205, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3512, "step": 8730 }, { "epoch": 2.740319353446585, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.1116, "step": 8731 }, { "epoch": 2.740633214327749, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3591, "step": 8732 }, { "epoch": 2.7409470752089136, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.3166, "step": 8733 }, { "epoch": 2.741260936090078, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2564, "step": 8734 }, { "epoch": 2.7415747969712427, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2005, "step": 8735 }, { "epoch": 2.7418886578524067, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2311, "step": 8736 }, { "epoch": 2.7422025187335715, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.13, "step": 8737 }, { "epoch": 2.742516379614736, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.3589, "step": 8738 }, { "epoch": 2.7428302404959, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.3245, "step": 8739 }, { "epoch": 2.7431441013770645, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.2068, "step": 8740 }, { "epoch": 2.743457962258229, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2018, "step": 8741 }, { "epoch": 2.7437718231393937, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.1263, "step": 8742 }, { "epoch": 2.7440856840205576, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.2845, "step": 8743 }, { "epoch": 2.7443995449017224, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.1948, "step": 8744 }, { "epoch": 2.744713405782887, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.2739, "step": 8745 }, { "epoch": 2.745027266664051, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.1993, "step": 8746 }, { "epoch": 2.7453411275452155, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.1774, "step": 8747 }, { "epoch": 2.74565498842638, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.2651, "step": 8748 }, { "epoch": 2.7459688493075447, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.2833, "step": 8749 }, { "epoch": 2.7462827101887086, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.4769, "step": 8750 }, { "epoch": 2.7465965710698734, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.5504, "step": 8751 }, { "epoch": 2.746910431951038, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.1777, "step": 8752 }, { "epoch": 2.747224292832202, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.576, "step": 8753 }, { "epoch": 2.7475381537133665, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3775, "step": 8754 }, { "epoch": 2.747852014594531, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4437, "step": 8755 }, { "epoch": 2.7481658754756952, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.6717, "step": 8756 }, { "epoch": 2.7484797363568596, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.7627, "step": 8757 }, { "epoch": 2.7487935972380244, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.5599, "step": 8758 }, { "epoch": 2.7491074581191888, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.6528, "step": 8759 }, { "epoch": 2.749421319000353, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.5964, "step": 8760 }, { "epoch": 2.7497351798815175, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.9533, "step": 8761 }, { "epoch": 2.750049040762682, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.6586, "step": 8762 }, { "epoch": 2.7503629016438462, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.8038, "step": 8763 }, { "epoch": 2.7506767625250106, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.8532, "step": 8764 }, { "epoch": 2.7509906234061754, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 2.0524, "step": 8765 }, { "epoch": 2.7513044842873398, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 2.0539, "step": 8766 }, { "epoch": 2.751618345168504, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.825, "step": 8767 }, { "epoch": 2.7519322060496685, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.5484, "step": 8768 }, { "epoch": 2.752246066930833, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.593, "step": 8769 }, { "epoch": 2.752559927811997, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.2336, "step": 8770 }, { "epoch": 2.7528737886931616, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.4015, "step": 8771 }, { "epoch": 2.7531876495743264, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 2.1122, "step": 8772 }, { "epoch": 2.7535015104554907, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4368, "step": 8773 }, { "epoch": 2.753815371336655, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.3118, "step": 8774 }, { "epoch": 2.7541292322178195, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2818, "step": 8775 }, { "epoch": 2.754443093098984, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.3451, "step": 8776 }, { "epoch": 2.754756953980148, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.3147, "step": 8777 }, { "epoch": 2.7550708148613126, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4598, "step": 8778 }, { "epoch": 2.7553846757424774, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.2215, "step": 8779 }, { "epoch": 2.7556985366236417, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.394, "step": 8780 }, { "epoch": 2.756012397504806, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2969, "step": 8781 }, { "epoch": 2.7563262583859705, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3555, "step": 8782 }, { "epoch": 2.756640119267135, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3092, "step": 8783 }, { "epoch": 2.756953980148299, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3651, "step": 8784 }, { "epoch": 2.7572678410294635, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.2268, "step": 8785 }, { "epoch": 2.7575817019106283, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3241, "step": 8786 }, { "epoch": 2.7578955627917923, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1267, "step": 8787 }, { "epoch": 2.758209423672957, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1252, "step": 8788 }, { "epoch": 2.7585232845541214, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.1699, "step": 8789 }, { "epoch": 2.7585232845541214, "eval_loss": 1.8641449213027954, "eval_runtime": 123.7639, "eval_samples_per_second": 8.08, "eval_steps_per_second": 8.08, "step": 8789 }, { "epoch": 2.7585232845541214, "mmlu_eval_accuracy": 0.4237500740786868, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.5172413793103449, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.0, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, "mmlu_eval_accuracy_college_medicine": 0.4090909090909091, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5714285714285714, "mmlu_eval_accuracy_high_school_macroeconomics": 0.46511627906976744, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.13043478260869565, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.8461538461538461, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, "mmlu_eval_accuracy_moral_disputes": 0.5263157894736842, "mmlu_eval_accuracy_moral_scenarios": 0.3, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.35294117647058826, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.1843531046775648, "step": 8789 }, { "epoch": 2.758837145435286, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1949, "step": 8790 }, { "epoch": 2.75915100631645, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.3141, "step": 8791 }, { "epoch": 2.7594648671976145, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.1525, "step": 8792 }, { "epoch": 2.7597787280787793, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.098, "step": 8793 }, { "epoch": 2.7600925889599432, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.1569, "step": 8794 }, { "epoch": 2.760406449841108, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.2112, "step": 8795 }, { "epoch": 2.7607203107222724, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.2274, "step": 8796 }, { "epoch": 2.761034171603437, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.118, "step": 8797 }, { "epoch": 2.761348032484601, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.1078, "step": 8798 }, { "epoch": 2.7616618933657655, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.2266, "step": 8799 }, { "epoch": 2.7619757542469303, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.2176, "step": 8800 }, { "epoch": 2.7622896151280942, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4884, "step": 8801 }, { "epoch": 2.762603476009259, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.5496, "step": 8802 }, { "epoch": 2.7629173368904234, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.5419, "step": 8803 }, { "epoch": 2.7632311977715878, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.4183, "step": 8804 }, { "epoch": 2.763545058652752, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.6199, "step": 8805 }, { "epoch": 2.7638589195339165, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.4599, "step": 8806 }, { "epoch": 2.764172780415081, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.8484, "step": 8807 }, { "epoch": 2.764486641296245, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.4786, "step": 8808 }, { "epoch": 2.76480050217741, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5282, "step": 8809 }, { "epoch": 2.7651143630585744, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.724, "step": 8810 }, { "epoch": 2.7654282239397387, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.8674, "step": 8811 }, { "epoch": 2.765742084820903, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.9523, "step": 8812 }, { "epoch": 2.7660559457020675, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 1.8308, "step": 8813 }, { "epoch": 2.766369806583232, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 1.7908, "step": 8814 }, { "epoch": 2.766683667464396, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.5607, "step": 8815 }, { "epoch": 2.766997528345561, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.3884, "step": 8816 }, { "epoch": 2.7673113892267254, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 2.0593, "step": 8817 }, { "epoch": 2.7676252501078897, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.4634, "step": 8818 }, { "epoch": 2.767939110989054, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.4195, "step": 8819 }, { "epoch": 2.7682529718702185, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.6989, "step": 8820 }, { "epoch": 2.768566832751383, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.6499, "step": 8821 }, { "epoch": 2.768880693632547, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 2.0474, "step": 8822 }, { "epoch": 2.769194554513712, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.6071, "step": 8823 }, { "epoch": 2.7695084153948764, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2207, "step": 8824 }, { "epoch": 2.7698222762760407, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3999, "step": 8825 }, { "epoch": 2.770136137157205, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.2489, "step": 8826 }, { "epoch": 2.7704499980383694, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.4141, "step": 8827 }, { "epoch": 2.770763858919534, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.218, "step": 8828 }, { "epoch": 2.771077719800698, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.3598, "step": 8829 }, { "epoch": 2.771391580681863, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.2875, "step": 8830 }, { "epoch": 2.7717054415630273, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.2131, "step": 8831 }, { "epoch": 2.7720193024441917, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3535, "step": 8832 }, { "epoch": 2.772333163325356, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.33, "step": 8833 }, { "epoch": 2.7726470242065204, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.4125, "step": 8834 }, { "epoch": 2.772960885087685, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3441, "step": 8835 }, { "epoch": 2.773274745968849, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2537, "step": 8836 }, { "epoch": 2.773588606850014, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1943, "step": 8837 }, { "epoch": 2.773902467731178, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.0238, "step": 8838 }, { "epoch": 2.7742163286123427, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2097, "step": 8839 }, { "epoch": 2.774530189493507, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.0903, "step": 8840 }, { "epoch": 2.7748440503746714, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2512, "step": 8841 }, { "epoch": 2.7751579112558358, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2876, "step": 8842 }, { "epoch": 2.775471772137, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.1895, "step": 8843 }, { "epoch": 2.775785633018165, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2201, "step": 8844 }, { "epoch": 2.776099493899329, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.5158, "step": 8845 }, { "epoch": 2.7764133547804937, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1696, "step": 8846 }, { "epoch": 2.776727215661658, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.1493, "step": 8847 }, { "epoch": 2.7770410765428224, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.2045, "step": 8848 }, { "epoch": 2.7773549374239868, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.1858, "step": 8849 }, { "epoch": 2.777668798305151, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.25, "step": 8850 }, { "epoch": 2.777982659186316, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.3839, "step": 8851 }, { "epoch": 2.77829652006748, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.2745, "step": 8852 }, { "epoch": 2.7786103809486447, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.6005, "step": 8853 }, { "epoch": 2.778924241829809, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.3623, "step": 8854 }, { "epoch": 2.7792381027109734, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.5142, "step": 8855 }, { "epoch": 2.7795519635921377, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.6793, "step": 8856 }, { "epoch": 2.779865824473302, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.5301, "step": 8857 }, { "epoch": 2.7801796853544665, "grad_norm": 1.875, "learning_rate": 0.0002, "loss": 1.7962, "step": 8858 }, { "epoch": 2.780493546235631, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 1.4824, "step": 8859 }, { "epoch": 2.7808074071167956, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.8273, "step": 8860 }, { "epoch": 2.78112126799796, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.6435, "step": 8861 }, { "epoch": 2.7814351288791244, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.5797, "step": 8862 }, { "epoch": 2.7817489897602887, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 2.2116, "step": 8863 }, { "epoch": 2.782062850641453, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.72, "step": 8864 }, { "epoch": 2.7823767115226175, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 1.8395, "step": 8865 }, { "epoch": 2.782690572403782, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 1.8089, "step": 8866 }, { "epoch": 2.7830044332849466, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.4679, "step": 8867 }, { "epoch": 2.783318294166111, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.5088, "step": 8868 }, { "epoch": 2.7836321550472753, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.5487, "step": 8869 }, { "epoch": 2.7839460159284397, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 1.7214, "step": 8870 }, { "epoch": 2.784259876809604, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 1.8389, "step": 8871 }, { "epoch": 2.7845737376907684, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 2.1598, "step": 8872 }, { "epoch": 2.784887598571933, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.6685, "step": 8873 }, { "epoch": 2.7852014594530976, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.3307, "step": 8874 }, { "epoch": 2.785515320334262, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.4261, "step": 8875 }, { "epoch": 2.7858291812154263, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.2749, "step": 8876 }, { "epoch": 2.7861430420965907, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.3086, "step": 8877 }, { "epoch": 2.786456902977755, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.238, "step": 8878 }, { "epoch": 2.7867707638589194, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.2043, "step": 8879 }, { "epoch": 2.787084624740084, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2158, "step": 8880 }, { "epoch": 2.7873984856212486, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2323, "step": 8881 }, { "epoch": 2.787712346502413, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.3248, "step": 8882 }, { "epoch": 2.7880262073835773, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2562, "step": 8883 }, { "epoch": 2.7883400682647417, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2253, "step": 8884 }, { "epoch": 2.788653929145906, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.2782, "step": 8885 }, { "epoch": 2.7889677900270704, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1852, "step": 8886 }, { "epoch": 2.7892816509082348, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.1523, "step": 8887 }, { "epoch": 2.7895955117893996, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1112, "step": 8888 }, { "epoch": 2.789909372670564, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.2292, "step": 8889 }, { "epoch": 2.7902232335517283, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.3274, "step": 8890 }, { "epoch": 2.7905370944328927, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.0913, "step": 8891 }, { "epoch": 2.790850955314057, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.1233, "step": 8892 }, { "epoch": 2.7911648161952214, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.0499, "step": 8893 }, { "epoch": 2.7914786770763858, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.2103, "step": 8894 }, { "epoch": 2.7917925379575506, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.2019, "step": 8895 }, { "epoch": 2.7921063988387145, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 0.9751, "step": 8896 }, { "epoch": 2.7924202597198793, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.1311, "step": 8897 }, { "epoch": 2.7927341206010436, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.3694, "step": 8898 }, { "epoch": 2.793047981482208, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.3614, "step": 8899 }, { "epoch": 2.7933618423633724, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.3947, "step": 8900 }, { "epoch": 2.7936757032445367, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2302, "step": 8901 }, { "epoch": 2.7939895641257015, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.2091, "step": 8902 }, { "epoch": 2.7943034250068655, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.5978, "step": 8903 }, { "epoch": 2.7946172858880303, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.4075, "step": 8904 }, { "epoch": 2.7949311467691946, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.4637, "step": 8905 }, { "epoch": 2.795245007650359, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.5931, "step": 8906 }, { "epoch": 2.7955588685315234, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 1.6932, "step": 8907 }, { "epoch": 2.7958727294126877, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 1.3757, "step": 8908 }, { "epoch": 2.7961865902938525, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.7842, "step": 8909 }, { "epoch": 2.7965004511750164, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.8431, "step": 8910 }, { "epoch": 2.7968143120561813, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.7284, "step": 8911 }, { "epoch": 2.7971281729373456, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.5956, "step": 8912 }, { "epoch": 2.79744203381851, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.3859, "step": 8913 }, { "epoch": 2.7977558946996743, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.7986, "step": 8914 }, { "epoch": 2.7980697555808387, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4169, "step": 8915 }, { "epoch": 2.798383616462003, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 1.7558, "step": 8916 }, { "epoch": 2.7986974773431674, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.8279, "step": 8917 }, { "epoch": 2.7990113382243322, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.501, "step": 8918 }, { "epoch": 2.7993251991054966, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.3465, "step": 8919 }, { "epoch": 2.799639059986661, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.6286, "step": 8920 }, { "epoch": 2.7999529208678253, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.461, "step": 8921 }, { "epoch": 2.8002667817489897, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 2.1564, "step": 8922 }, { "epoch": 2.800580642630154, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.6156, "step": 8923 }, { "epoch": 2.8008945035113184, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3794, "step": 8924 }, { "epoch": 2.801208364392483, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.2563, "step": 8925 }, { "epoch": 2.8015222252736476, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3096, "step": 8926 }, { "epoch": 2.801836086154812, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.3823, "step": 8927 }, { "epoch": 2.8021499470359763, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2788, "step": 8928 }, { "epoch": 2.8024638079171407, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2653, "step": 8929 }, { "epoch": 2.802777668798305, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.219, "step": 8930 }, { "epoch": 2.8030915296794694, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.3238, "step": 8931 }, { "epoch": 2.803405390560634, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2924, "step": 8932 }, { "epoch": 2.8037192514417986, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.1432, "step": 8933 }, { "epoch": 2.804033112322963, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1038, "step": 8934 }, { "epoch": 2.8043469732041273, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2166, "step": 8935 }, { "epoch": 2.8046608340852917, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.364, "step": 8936 }, { "epoch": 2.804974694966456, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2222, "step": 8937 }, { "epoch": 2.8052885558476204, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.197, "step": 8938 }, { "epoch": 2.805602416728785, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.0971, "step": 8939 }, { "epoch": 2.8059162776099495, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.1964, "step": 8940 }, { "epoch": 2.806230138491114, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.2351, "step": 8941 }, { "epoch": 2.8065439993722783, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2927, "step": 8942 }, { "epoch": 2.8068578602534426, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.431, "step": 8943 }, { "epoch": 2.807171721134607, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.2947, "step": 8944 }, { "epoch": 2.8074855820157714, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2102, "step": 8945 }, { "epoch": 2.807799442896936, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.2571, "step": 8946 }, { "epoch": 2.8081133037781, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.1792, "step": 8947 }, { "epoch": 2.808427164659265, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.0224, "step": 8948 }, { "epoch": 2.8087410255404293, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.2309, "step": 8949 }, { "epoch": 2.8090548864215936, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.2926, "step": 8950 }, { "epoch": 2.809368747302758, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.4216, "step": 8951 }, { "epoch": 2.8096826081839223, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.3783, "step": 8952 }, { "epoch": 2.809996469065087, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.3547, "step": 8953 }, { "epoch": 2.810310329946251, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4433, "step": 8954 }, { "epoch": 2.810624190827416, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.7898, "step": 8955 }, { "epoch": 2.8109380517085802, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.6209, "step": 8956 }, { "epoch": 2.8112519125897446, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.4303, "step": 8957 }, { "epoch": 2.811565773470909, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.8104, "step": 8958 }, { "epoch": 2.8118796343520733, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.4039, "step": 8959 }, { "epoch": 2.812193495233238, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4615, "step": 8960 }, { "epoch": 2.812507356114402, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.9492, "step": 8961 }, { "epoch": 2.812821216995567, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 2.0747, "step": 8962 }, { "epoch": 2.8131350778767312, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 1.9825, "step": 8963 }, { "epoch": 2.8134489387578956, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.585, "step": 8964 }, { "epoch": 2.81376279963906, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 1.6743, "step": 8965 }, { "epoch": 2.8140766605202243, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.5009, "step": 8966 }, { "epoch": 2.8143905214013887, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 2.0763, "step": 8967 }, { "epoch": 2.814704382282553, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.8759, "step": 8968 }, { "epoch": 2.815018243163718, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.5192, "step": 8969 }, { "epoch": 2.815332104044882, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 1.8401, "step": 8970 }, { "epoch": 2.8156459649260466, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.3803, "step": 8971 }, { "epoch": 2.815959825807211, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 2.1544, "step": 8972 }, { "epoch": 2.8162736866883753, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.914, "step": 8973 }, { "epoch": 2.8165875475695397, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.0397, "step": 8974 }, { "epoch": 2.816901408450704, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.2654, "step": 8975 }, { "epoch": 2.817215269331869, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.3141, "step": 8976 }, { "epoch": 2.817215269331869, "eval_loss": 1.8765534162521362, "eval_runtime": 123.8379, "eval_samples_per_second": 8.075, "eval_steps_per_second": 8.075, "step": 8976 }, { "epoch": 2.817215269331869, "mmlu_eval_accuracy": 0.43177612834492324, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.6428571428571429, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.5, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.4230769230769231, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.5348837209302325, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.5, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.64, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5465116279069767, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.3, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.1526709688324932, "step": 8976 }, { "epoch": 2.817529130213033, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.3155, "step": 8977 }, { "epoch": 2.8178429910941976, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.3944, "step": 8978 }, { "epoch": 2.818156851975362, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.3358, "step": 8979 }, { "epoch": 2.8184707128565263, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.3133, "step": 8980 }, { "epoch": 2.8187845737376906, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1682, "step": 8981 }, { "epoch": 2.819098434618855, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3518, "step": 8982 }, { "epoch": 2.81941229550002, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2919, "step": 8983 }, { "epoch": 2.819726156381184, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.112, "step": 8984 }, { "epoch": 2.8200400172623485, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3675, "step": 8985 }, { "epoch": 2.820353878143513, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2422, "step": 8986 }, { "epoch": 2.8206677390246773, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2066, "step": 8987 }, { "epoch": 2.8209815999058416, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2395, "step": 8988 }, { "epoch": 2.821295460787006, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.2021, "step": 8989 }, { "epoch": 2.821609321668171, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.3239, "step": 8990 }, { "epoch": 2.821923182549335, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.3449, "step": 8991 }, { "epoch": 2.8222370434304995, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.0573, "step": 8992 }, { "epoch": 2.822550904311664, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0472, "step": 8993 }, { "epoch": 2.8228647651928283, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.1821, "step": 8994 }, { "epoch": 2.8231786260739926, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.057, "step": 8995 }, { "epoch": 2.823492486955157, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.2382, "step": 8996 }, { "epoch": 2.823806347836322, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.3752, "step": 8997 }, { "epoch": 2.8241202087174857, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.4838, "step": 8998 }, { "epoch": 2.8244340695986505, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.191, "step": 8999 }, { "epoch": 2.824747930479815, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.3935, "step": 9000 }, { "epoch": 2.8250617913609792, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.6454, "step": 9001 }, { "epoch": 2.8253756522421436, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.2102, "step": 9002 }, { "epoch": 2.825689513123308, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.2245, "step": 9003 }, { "epoch": 2.8260033740044728, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.2379, "step": 9004 }, { "epoch": 2.8263172348856367, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.4051, "step": 9005 }, { "epoch": 2.8266310957668015, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.8193, "step": 9006 }, { "epoch": 2.826944956647966, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.6175, "step": 9007 }, { "epoch": 2.82725881752913, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 1.7663, "step": 9008 }, { "epoch": 2.8275726784102946, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.7074, "step": 9009 }, { "epoch": 2.827886539291459, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.7753, "step": 9010 }, { "epoch": 2.8282004001726238, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.559, "step": 9011 }, { "epoch": 2.8285142610537877, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 1.8392, "step": 9012 }, { "epoch": 2.8288281219349525, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.9288, "step": 9013 }, { "epoch": 2.829141982816117, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.8832, "step": 9014 }, { "epoch": 2.829455843697281, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.6077, "step": 9015 }, { "epoch": 2.8297697045784456, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 2.1331, "step": 9016 }, { "epoch": 2.83008356545961, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5486, "step": 9017 }, { "epoch": 2.8303974263407743, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.58, "step": 9018 }, { "epoch": 2.8307112872219387, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.4177, "step": 9019 }, { "epoch": 2.8310251481031035, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5542, "step": 9020 }, { "epoch": 2.831339008984268, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.5834, "step": 9021 }, { "epoch": 2.831652869865432, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 2.646, "step": 9022 }, { "epoch": 2.8319667307465966, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.8011, "step": 9023 }, { "epoch": 2.832280591627761, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.2585, "step": 9024 }, { "epoch": 2.8325944525089253, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.2951, "step": 9025 }, { "epoch": 2.8329083133900896, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.3611, "step": 9026 }, { "epoch": 2.8332221742712544, "grad_norm": 0.2275390625, "learning_rate": 0.0002, "loss": 1.4556, "step": 9027 }, { "epoch": 2.833536035152419, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.2796, "step": 9028 }, { "epoch": 2.833849896033583, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2206, "step": 9029 }, { "epoch": 2.8341637569147475, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.2956, "step": 9030 }, { "epoch": 2.834477617795912, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2332, "step": 9031 }, { "epoch": 2.8347914786770763, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.3109, "step": 9032 }, { "epoch": 2.8351053395582406, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2161, "step": 9033 }, { "epoch": 2.8354192004394054, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3615, "step": 9034 }, { "epoch": 2.83573306132057, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.3017, "step": 9035 }, { "epoch": 2.836046922201734, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.167, "step": 9036 }, { "epoch": 2.8363607830828985, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.2524, "step": 9037 }, { "epoch": 2.836674643964063, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1429, "step": 9038 }, { "epoch": 2.8369885048452272, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.3353, "step": 9039 }, { "epoch": 2.8373023657263916, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1741, "step": 9040 }, { "epoch": 2.8376162266075564, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.2731, "step": 9041 }, { "epoch": 2.8379300874887208, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.155, "step": 9042 }, { "epoch": 2.838243948369885, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.2684, "step": 9043 }, { "epoch": 2.8385578092510495, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.2094, "step": 9044 }, { "epoch": 2.838871670132214, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.1623, "step": 9045 }, { "epoch": 2.8391855310133782, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.2455, "step": 9046 }, { "epoch": 2.8394993918945426, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1499, "step": 9047 }, { "epoch": 2.8398132527757074, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.3241, "step": 9048 }, { "epoch": 2.8401271136568713, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.2392, "step": 9049 }, { "epoch": 2.840440974538036, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.347, "step": 9050 }, { "epoch": 2.8407548354192005, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.3251, "step": 9051 }, { "epoch": 2.841068696300365, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.0969, "step": 9052 }, { "epoch": 2.841382557181529, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.49, "step": 9053 }, { "epoch": 2.8416964180626936, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.3396, "step": 9054 }, { "epoch": 2.8420102789438584, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5866, "step": 9055 }, { "epoch": 2.8423241398250223, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.6271, "step": 9056 }, { "epoch": 2.842638000706187, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3321, "step": 9057 }, { "epoch": 2.8429518615873515, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.5362, "step": 9058 }, { "epoch": 2.843265722468516, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.414, "step": 9059 }, { "epoch": 2.84357958334968, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.3834, "step": 9060 }, { "epoch": 2.8438934442308446, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.6058, "step": 9061 }, { "epoch": 2.8442073051120094, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.3741, "step": 9062 }, { "epoch": 2.8445211659931733, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 2.5764, "step": 9063 }, { "epoch": 2.844835026874338, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.4562, "step": 9064 }, { "epoch": 2.8451488877555025, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5226, "step": 9065 }, { "epoch": 2.845462748636667, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.4985, "step": 9066 }, { "epoch": 2.845776609517831, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.6727, "step": 9067 }, { "epoch": 2.8460904703989955, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.2927, "step": 9068 }, { "epoch": 2.84640433128016, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 1.5055, "step": 9069 }, { "epoch": 2.8467181921613243, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.4934, "step": 9070 }, { "epoch": 2.847032053042489, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.565, "step": 9071 }, { "epoch": 2.8473459139236534, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 2.0119, "step": 9072 }, { "epoch": 2.847659774804818, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.6404, "step": 9073 }, { "epoch": 2.847973635685982, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.498, "step": 9074 }, { "epoch": 2.8482874965671465, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.295, "step": 9075 }, { "epoch": 2.848601357448311, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.448, "step": 9076 }, { "epoch": 2.8489152183294753, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.2643, "step": 9077 }, { "epoch": 2.84922907921064, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.1701, "step": 9078 }, { "epoch": 2.8495429400918044, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.3511, "step": 9079 }, { "epoch": 2.849856800972969, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.2698, "step": 9080 }, { "epoch": 2.850170661854133, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.32, "step": 9081 }, { "epoch": 2.8504845227352975, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1967, "step": 9082 }, { "epoch": 2.850798383616462, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2424, "step": 9083 }, { "epoch": 2.8511122444976262, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2262, "step": 9084 }, { "epoch": 2.851426105378791, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2606, "step": 9085 }, { "epoch": 2.8517399662599554, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2633, "step": 9086 }, { "epoch": 2.8520538271411198, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.1226, "step": 9087 }, { "epoch": 2.852367688022284, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.1934, "step": 9088 }, { "epoch": 2.8526815489034485, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.3545, "step": 9089 }, { "epoch": 2.852995409784613, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.2261, "step": 9090 }, { "epoch": 2.853309270665777, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.1806, "step": 9091 }, { "epoch": 2.853623131546942, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 1.3735, "step": 9092 }, { "epoch": 2.8539369924281064, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.224, "step": 9093 }, { "epoch": 2.8542508533092708, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2, "step": 9094 }, { "epoch": 2.854564714190435, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.1722, "step": 9095 }, { "epoch": 2.8548785750715995, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.3908, "step": 9096 }, { "epoch": 2.855192435952764, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.1836, "step": 9097 }, { "epoch": 2.855506296833928, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.1041, "step": 9098 }, { "epoch": 2.855820157715093, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4832, "step": 9099 }, { "epoch": 2.8561340185962574, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 1.6583, "step": 9100 }, { "epoch": 2.8564478794774217, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.3819, "step": 9101 }, { "epoch": 2.856761740358586, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5399, "step": 9102 }, { "epoch": 2.8570756012397505, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.4292, "step": 9103 }, { "epoch": 2.857389462120915, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.4899, "step": 9104 }, { "epoch": 2.857703323002079, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.6549, "step": 9105 }, { "epoch": 2.858017183883244, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.6108, "step": 9106 }, { "epoch": 2.858331044764408, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.6387, "step": 9107 }, { "epoch": 2.8586449056455727, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.6155, "step": 9108 }, { "epoch": 2.858958766526737, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.5385, "step": 9109 }, { "epoch": 2.8592726274079014, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.5619, "step": 9110 }, { "epoch": 2.859586488289066, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.6163, "step": 9111 }, { "epoch": 2.85990034917023, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 1.7083, "step": 9112 }, { "epoch": 2.860214210051395, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 2.0019, "step": 9113 }, { "epoch": 2.860528070932559, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.7176, "step": 9114 }, { "epoch": 2.8608419318137237, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.7489, "step": 9115 }, { "epoch": 2.861155792694888, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.9442, "step": 9116 }, { "epoch": 2.8614696535760524, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.7019, "step": 9117 }, { "epoch": 2.861783514457217, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 1.6234, "step": 9118 }, { "epoch": 2.862097375338381, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.3765, "step": 9119 }, { "epoch": 2.862411236219546, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 1.532, "step": 9120 }, { "epoch": 2.86272509710071, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.594, "step": 9121 }, { "epoch": 2.8630389579818747, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 2.1353, "step": 9122 }, { "epoch": 2.863352818863039, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 1.7767, "step": 9123 }, { "epoch": 2.8636666797442034, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.2067, "step": 9124 }, { "epoch": 2.8639805406253678, "grad_norm": 0.224609375, "learning_rate": 0.0002, "loss": 1.2977, "step": 9125 }, { "epoch": 2.864294401506532, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3069, "step": 9126 }, { "epoch": 2.8646082623876965, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.4639, "step": 9127 }, { "epoch": 2.864922123268861, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.1895, "step": 9128 }, { "epoch": 2.8652359841500257, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.3323, "step": 9129 }, { "epoch": 2.86554984503119, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.2823, "step": 9130 }, { "epoch": 2.8658637059123544, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.2297, "step": 9131 }, { "epoch": 2.8661775667935188, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.3366, "step": 9132 }, { "epoch": 2.866491427674683, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3351, "step": 9133 }, { "epoch": 2.8668052885558475, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2682, "step": 9134 }, { "epoch": 2.867119149437012, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.3796, "step": 9135 }, { "epoch": 2.8674330103181767, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1383, "step": 9136 }, { "epoch": 2.867746871199341, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.3909, "step": 9137 }, { "epoch": 2.8680607320805054, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2909, "step": 9138 }, { "epoch": 2.8683745929616697, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2686, "step": 9139 }, { "epoch": 2.868688453842834, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.1662, "step": 9140 }, { "epoch": 2.8690023147239985, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.0779, "step": 9141 }, { "epoch": 2.869316175605163, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.2465, "step": 9142 }, { "epoch": 2.8696300364863276, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.134, "step": 9143 }, { "epoch": 2.869943897367492, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.1521, "step": 9144 }, { "epoch": 2.8702577582486564, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.073, "step": 9145 }, { "epoch": 2.8705716191298207, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.252, "step": 9146 }, { "epoch": 2.870885480010985, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.1757, "step": 9147 }, { "epoch": 2.8711993408921495, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2349, "step": 9148 }, { "epoch": 2.871513201773314, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.1728, "step": 9149 }, { "epoch": 2.8718270626544786, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.3679, "step": 9150 }, { "epoch": 2.872140923535643, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.4438, "step": 9151 }, { "epoch": 2.8724547844168073, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.2971, "step": 9152 }, { "epoch": 2.8727686452979717, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.3071, "step": 9153 }, { "epoch": 2.873082506179136, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.401, "step": 9154 }, { "epoch": 2.8733963670603004, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4128, "step": 9155 }, { "epoch": 2.873710227941465, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.833, "step": 9156 }, { "epoch": 2.8740240888226296, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.4724, "step": 9157 }, { "epoch": 2.8743379497037935, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.3588, "step": 9158 }, { "epoch": 2.8746518105849583, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 1.8677, "step": 9159 }, { "epoch": 2.8749656714661227, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.2482, "step": 9160 }, { "epoch": 2.875279532347287, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.9113, "step": 9161 }, { "epoch": 2.8755933932284514, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 1.7697, "step": 9162 }, { "epoch": 2.875907254109616, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.7191, "step": 9163 }, { "epoch": 2.875907254109616, "eval_loss": 1.8761990070343018, "eval_runtime": 123.7901, "eval_samples_per_second": 8.078, "eval_steps_per_second": 8.078, "step": 9163 }, { "epoch": 2.875907254109616, "mmlu_eval_accuracy": 0.4258227348042516, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.4230769230769231, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.5, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4883720930232558, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.6333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.5454545454545454, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, "mmlu_eval_accuracy_moral_disputes": 0.5526315789473685, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.5833333333333334, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.6842105263157895, "mmlu_loss": 1.1657283430703709, "step": 9163 }, { "epoch": 2.8762211149907806, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.4422, "step": 9164 }, { "epoch": 2.8765349758719445, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.641, "step": 9165 }, { "epoch": 2.8768488367531093, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.5573, "step": 9166 }, { "epoch": 2.8771626976342737, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.5835, "step": 9167 }, { "epoch": 2.877476558515438, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.7923, "step": 9168 }, { "epoch": 2.8777904193966024, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.3367, "step": 9169 }, { "epoch": 2.8781042802777668, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.5132, "step": 9170 }, { "epoch": 2.8784181411589316, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 1.8228, "step": 9171 }, { "epoch": 2.8787320020400955, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 2.0626, "step": 9172 }, { "epoch": 2.8790458629212603, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.5042, "step": 9173 }, { "epoch": 2.8793597238024247, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.2516, "step": 9174 }, { "epoch": 2.879673584683589, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.3063, "step": 9175 }, { "epoch": 2.8799874455647534, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.4874, "step": 9176 }, { "epoch": 2.8803013064459178, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.2835, "step": 9177 }, { "epoch": 2.880615167327082, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.2245, "step": 9178 }, { "epoch": 2.8809290282082465, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.3496, "step": 9179 }, { "epoch": 2.8812428890894113, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.1062, "step": 9180 }, { "epoch": 2.8815567499705756, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.4002, "step": 9181 }, { "epoch": 2.88187061085174, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.2155, "step": 9182 }, { "epoch": 2.8821844717329044, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1771, "step": 9183 }, { "epoch": 2.8824983326140687, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.3097, "step": 9184 }, { "epoch": 2.882812193495233, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.184, "step": 9185 }, { "epoch": 2.8831260543763975, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2425, "step": 9186 }, { "epoch": 2.8834399152575623, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.2401, "step": 9187 }, { "epoch": 2.8837537761387266, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1407, "step": 9188 }, { "epoch": 2.884067637019891, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2738, "step": 9189 }, { "epoch": 2.8843814979010554, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.2728, "step": 9190 }, { "epoch": 2.8846953587822197, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2418, "step": 9191 }, { "epoch": 2.885009219663384, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.2583, "step": 9192 }, { "epoch": 2.8853230805445484, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.2751, "step": 9193 }, { "epoch": 2.8856369414257133, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.1985, "step": 9194 }, { "epoch": 2.8859508023068776, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.2711, "step": 9195 }, { "epoch": 2.886264663188042, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1077, "step": 9196 }, { "epoch": 2.8865785240692063, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.3773, "step": 9197 }, { "epoch": 2.8868923849503707, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.1897, "step": 9198 }, { "epoch": 2.887206245831535, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.1688, "step": 9199 }, { "epoch": 2.8875201067126994, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.5106, "step": 9200 }, { "epoch": 2.8878339675938642, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.3914, "step": 9201 }, { "epoch": 2.8881478284750286, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.5654, "step": 9202 }, { "epoch": 2.888461689356193, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.4306, "step": 9203 }, { "epoch": 2.8887755502373573, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.4539, "step": 9204 }, { "epoch": 2.8890894111185217, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.4266, "step": 9205 }, { "epoch": 2.889403271999686, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.6812, "step": 9206 }, { "epoch": 2.8897171328808504, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5194, "step": 9207 }, { "epoch": 2.890030993762015, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.8071, "step": 9208 }, { "epoch": 2.890344854643179, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.7653, "step": 9209 }, { "epoch": 2.890658715524344, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.465, "step": 9210 }, { "epoch": 2.8909725764055083, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.6069, "step": 9211 }, { "epoch": 2.8912864372866727, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 2.0836, "step": 9212 }, { "epoch": 2.891600298167837, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.7322, "step": 9213 }, { "epoch": 2.8919141590490014, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 2.0488, "step": 9214 }, { "epoch": 2.892228019930166, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.703, "step": 9215 }, { "epoch": 2.89254188081133, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.9165, "step": 9216 }, { "epoch": 2.892855741692495, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.7885, "step": 9217 }, { "epoch": 2.8931696025736593, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.0853, "step": 9218 }, { "epoch": 2.8934834634548237, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.1929, "step": 9219 }, { "epoch": 2.893797324335988, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 1.7236, "step": 9220 }, { "epoch": 2.8941111852171524, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7137, "step": 9221 }, { "epoch": 2.894425046098317, "grad_norm": 1.421875, "learning_rate": 0.0002, "loss": 2.7347, "step": 9222 }, { "epoch": 2.894738906979481, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.4389, "step": 9223 }, { "epoch": 2.895052767860646, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.2317, "step": 9224 }, { "epoch": 2.8953666287418103, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.3827, "step": 9225 }, { "epoch": 2.8956804896229746, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.3125, "step": 9226 }, { "epoch": 2.895994350504139, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.2708, "step": 9227 }, { "epoch": 2.8963082113853034, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.3622, "step": 9228 }, { "epoch": 2.8966220722664677, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2798, "step": 9229 }, { "epoch": 2.896935933147632, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.4568, "step": 9230 }, { "epoch": 2.897249794028797, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.3127, "step": 9231 }, { "epoch": 2.8975636549099613, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.186, "step": 9232 }, { "epoch": 2.8978775157911256, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.044, "step": 9233 }, { "epoch": 2.89819137667229, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.2048, "step": 9234 }, { "epoch": 2.8985052375534544, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2282, "step": 9235 }, { "epoch": 2.8988190984346187, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.066, "step": 9236 }, { "epoch": 2.899132959315783, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.078, "step": 9237 }, { "epoch": 2.899446820196948, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.2239, "step": 9238 }, { "epoch": 2.8997606810781122, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2316, "step": 9239 }, { "epoch": 2.9000745419592766, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2616, "step": 9240 }, { "epoch": 2.900388402840441, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.1115, "step": 9241 }, { "epoch": 2.9007022637216053, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.1179, "step": 9242 }, { "epoch": 2.9010161246027697, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.1528, "step": 9243 }, { "epoch": 2.901329985483934, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.2991, "step": 9244 }, { "epoch": 2.901643846365099, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.373, "step": 9245 }, { "epoch": 2.9019577072462632, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.2299, "step": 9246 }, { "epoch": 2.9022715681274276, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.1942, "step": 9247 }, { "epoch": 2.902585429008592, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.1646, "step": 9248 }, { "epoch": 2.9028992898897563, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.2777, "step": 9249 }, { "epoch": 2.9032131507709207, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.9132, "step": 9250 }, { "epoch": 2.903527011652085, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.4572, "step": 9251 }, { "epoch": 2.90384087253325, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.6071, "step": 9252 }, { "epoch": 2.904154733414414, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.6622, "step": 9253 }, { "epoch": 2.9044685942955786, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.5777, "step": 9254 }, { "epoch": 2.904782455176743, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.526, "step": 9255 }, { "epoch": 2.9050963160579073, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.5428, "step": 9256 }, { "epoch": 2.9054101769390717, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.6678, "step": 9257 }, { "epoch": 2.905724037820236, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.5418, "step": 9258 }, { "epoch": 2.906037898701401, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.4801, "step": 9259 }, { "epoch": 2.9063517595825648, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.6799, "step": 9260 }, { "epoch": 2.9066656204637296, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 1.9311, "step": 9261 }, { "epoch": 2.906979481344894, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.6379, "step": 9262 }, { "epoch": 2.9072933422260583, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.7348, "step": 9263 }, { "epoch": 2.9076072031072226, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 1.82, "step": 9264 }, { "epoch": 2.907921063988387, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.7209, "step": 9265 }, { "epoch": 2.908234924869552, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.7559, "step": 9266 }, { "epoch": 2.9085487857507157, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.6945, "step": 9267 }, { "epoch": 2.9088626466318805, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.5186, "step": 9268 }, { "epoch": 2.909176507513045, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.473, "step": 9269 }, { "epoch": 2.9094903683942093, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.3865, "step": 9270 }, { "epoch": 2.9098042292753736, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 1.6226, "step": 9271 }, { "epoch": 2.910118090156538, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 2.4095, "step": 9272 }, { "epoch": 2.910431951037703, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.6056, "step": 9273 }, { "epoch": 2.9107458119188667, "grad_norm": 0.212890625, "learning_rate": 0.0002, "loss": 1.3538, "step": 9274 }, { "epoch": 2.9110596728000315, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.3232, "step": 9275 }, { "epoch": 2.911373533681196, "grad_norm": 0.2060546875, "learning_rate": 0.0002, "loss": 1.2636, "step": 9276 }, { "epoch": 2.9116873945623603, "grad_norm": 0.2119140625, "learning_rate": 0.0002, "loss": 1.3079, "step": 9277 }, { "epoch": 2.9120012554435246, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.5118, "step": 9278 }, { "epoch": 2.912315116324689, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.3031, "step": 9279 }, { "epoch": 2.9126289772058533, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.2654, "step": 9280 }, { "epoch": 2.9129428380870177, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2706, "step": 9281 }, { "epoch": 2.9132566989681825, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1949, "step": 9282 }, { "epoch": 2.913570559849347, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2541, "step": 9283 }, { "epoch": 2.9138844207305112, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.1771, "step": 9284 }, { "epoch": 2.9141982816116756, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.1466, "step": 9285 }, { "epoch": 2.91451214249284, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2282, "step": 9286 }, { "epoch": 2.9148260033740043, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2609, "step": 9287 }, { "epoch": 2.9151398642551687, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.278, "step": 9288 }, { "epoch": 2.9154537251363335, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.2025, "step": 9289 }, { "epoch": 2.915767586017498, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2289, "step": 9290 }, { "epoch": 2.916081446898662, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1323, "step": 9291 }, { "epoch": 2.9163953077798266, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2033, "step": 9292 }, { "epoch": 2.916709168660991, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.2382, "step": 9293 }, { "epoch": 2.9170230295421553, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.0987, "step": 9294 }, { "epoch": 2.9173368904233197, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.061, "step": 9295 }, { "epoch": 2.9176507513044845, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.2109, "step": 9296 }, { "epoch": 2.917964612185649, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.183, "step": 9297 }, { "epoch": 2.918278473066813, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.156, "step": 9298 }, { "epoch": 2.9185923339479776, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.2748, "step": 9299 }, { "epoch": 2.918906194829142, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.2532, "step": 9300 }, { "epoch": 2.9192200557103063, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.3469, "step": 9301 }, { "epoch": 2.9195339165914707, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.2769, "step": 9302 }, { "epoch": 2.9198477774726355, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.285, "step": 9303 }, { "epoch": 2.9201616383538, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.3363, "step": 9304 }, { "epoch": 2.920475499234964, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.4428, "step": 9305 }, { "epoch": 2.9207893601161286, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.3417, "step": 9306 }, { "epoch": 2.921103220997293, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.7082, "step": 9307 }, { "epoch": 2.9214170818784573, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.8225, "step": 9308 }, { "epoch": 2.9217309427596216, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.6316, "step": 9309 }, { "epoch": 2.9220448036407864, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 1.712, "step": 9310 }, { "epoch": 2.922358664521951, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.8636, "step": 9311 }, { "epoch": 2.922672525403115, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 1.6578, "step": 9312 }, { "epoch": 2.9229863862842795, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.5501, "step": 9313 }, { "epoch": 2.923300247165444, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.8732, "step": 9314 }, { "epoch": 2.9236141080466083, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 1.702, "step": 9315 }, { "epoch": 2.9239279689277726, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.6833, "step": 9316 }, { "epoch": 2.9242418298089374, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.5025, "step": 9317 }, { "epoch": 2.9245556906901014, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 1.7514, "step": 9318 }, { "epoch": 2.924869551571266, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.518, "step": 9319 }, { "epoch": 2.9251834124524305, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5729, "step": 9320 }, { "epoch": 2.925497273333595, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.6051, "step": 9321 }, { "epoch": 2.9258111342147592, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.9974, "step": 9322 }, { "epoch": 2.9261249950959236, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.5181, "step": 9323 }, { "epoch": 2.9264388559770884, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.2338, "step": 9324 }, { "epoch": 2.9267527168582523, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.3301, "step": 9325 }, { "epoch": 2.927066577739417, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.258, "step": 9326 }, { "epoch": 2.9273804386205815, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.3584, "step": 9327 }, { "epoch": 2.927694299501746, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.4484, "step": 9328 }, { "epoch": 2.9280081603829102, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.153, "step": 9329 }, { "epoch": 2.9283220212640746, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2141, "step": 9330 }, { "epoch": 2.9286358821452394, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2206, "step": 9331 }, { "epoch": 2.9289497430264033, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.1699, "step": 9332 }, { "epoch": 2.929263603907568, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 1.2987, "step": 9333 }, { "epoch": 2.9295774647887325, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1484, "step": 9334 }, { "epoch": 2.929891325669897, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3435, "step": 9335 }, { "epoch": 2.930205186551061, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2913, "step": 9336 }, { "epoch": 2.9305190474322256, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2911, "step": 9337 }, { "epoch": 2.93083290831339, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.1074, "step": 9338 }, { "epoch": 2.9311467691945543, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1217, "step": 9339 }, { "epoch": 2.931460630075719, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2084, "step": 9340 }, { "epoch": 2.9317744909568835, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1905, "step": 9341 }, { "epoch": 2.932088351838048, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.262, "step": 9342 }, { "epoch": 2.932402212719212, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.0687, "step": 9343 }, { "epoch": 2.9327160736003766, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.0989, "step": 9344 }, { "epoch": 2.933029934481541, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.0, "step": 9345 }, { "epoch": 2.9333437953627053, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.2588, "step": 9346 }, { "epoch": 2.93365765624387, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.5604, "step": 9347 }, { "epoch": 2.9339715171250345, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.2224, "step": 9348 }, { "epoch": 2.934285378006199, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.1948, "step": 9349 }, { "epoch": 2.934599238887363, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.3169, "step": 9350 }, { "epoch": 2.934599238887363, "eval_loss": 1.8553611040115356, "eval_runtime": 123.1225, "eval_samples_per_second": 8.122, "eval_steps_per_second": 8.122, "step": 9350 }, { "epoch": 2.934599238887363, "mmlu_eval_accuracy": 0.42224076268422145, "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.4375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.6, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.4883720930232558, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.6666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.08695652173913043, "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909, "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384, "mmlu_eval_accuracy_human_aging": 0.5652173913043478, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.68, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.5232558139534884, "mmlu_eval_accuracy_moral_disputes": 0.5263157894736842, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.3142857142857143, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.32941176470588235, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.5454545454545454, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.631578947368421, "mmlu_loss": 1.4160843094644167, "step": 9350 }, { "epoch": 2.9349130997685275, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.2188, "step": 9351 }, { "epoch": 2.935226960649692, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.4218, "step": 9352 }, { "epoch": 2.9355408215308563, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4033, "step": 9353 }, { "epoch": 2.935854682412021, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2627, "step": 9354 }, { "epoch": 2.9361685432931854, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.6191, "step": 9355 }, { "epoch": 2.93648240417435, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.6385, "step": 9356 }, { "epoch": 2.936796265055514, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.4936, "step": 9357 }, { "epoch": 2.9371101259366785, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.459, "step": 9358 }, { "epoch": 2.937423986817843, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.5441, "step": 9359 }, { "epoch": 2.9377378476990073, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.6919, "step": 9360 }, { "epoch": 2.938051708580172, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.44, "step": 9361 }, { "epoch": 2.9383655694613364, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.5979, "step": 9362 }, { "epoch": 2.938679430342501, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.8018, "step": 9363 }, { "epoch": 2.938993291223665, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.6639, "step": 9364 }, { "epoch": 2.9393071521048295, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 1.8697, "step": 9365 }, { "epoch": 2.939621012985994, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.6695, "step": 9366 }, { "epoch": 2.9399348738671582, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 2.0399, "step": 9367 }, { "epoch": 2.940248734748323, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.5309, "step": 9368 }, { "epoch": 2.940562595629487, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.8864, "step": 9369 }, { "epoch": 2.9408764565106518, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 1.7542, "step": 9370 }, { "epoch": 2.941190317391816, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.7105, "step": 9371 }, { "epoch": 2.9415041782729805, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.9176, "step": 9372 }, { "epoch": 2.941818039154145, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.8492, "step": 9373 }, { "epoch": 2.9421319000353092, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.2629, "step": 9374 }, { "epoch": 2.942445760916474, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.2557, "step": 9375 }, { "epoch": 2.942759621797638, "grad_norm": 0.216796875, "learning_rate": 0.0002, "loss": 1.2919, "step": 9376 }, { "epoch": 2.9430734826788028, "grad_norm": 0.2021484375, "learning_rate": 0.0002, "loss": 1.132, "step": 9377 }, { "epoch": 2.943387343559967, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4452, "step": 9378 }, { "epoch": 2.9437012044411315, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.1625, "step": 9379 }, { "epoch": 2.944015065322296, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3731, "step": 9380 }, { "epoch": 2.94432892620346, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2291, "step": 9381 }, { "epoch": 2.944642787084625, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.3691, "step": 9382 }, { "epoch": 2.944956647965789, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.168, "step": 9383 }, { "epoch": 2.9452705088469537, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.3094, "step": 9384 }, { "epoch": 2.945584369728118, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.394, "step": 9385 }, { "epoch": 2.9458982306092825, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.1918, "step": 9386 }, { "epoch": 2.946212091490447, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.165, "step": 9387 }, { "epoch": 2.946525952371611, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.2, "step": 9388 }, { "epoch": 2.9468398132527756, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2335, "step": 9389 }, { "epoch": 2.94715367413394, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.1117, "step": 9390 }, { "epoch": 2.9474675350151047, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.2898, "step": 9391 }, { "epoch": 2.947781395896269, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1785, "step": 9392 }, { "epoch": 2.9480952567774334, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.4124, "step": 9393 }, { "epoch": 2.948409117658598, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.0698, "step": 9394 }, { "epoch": 2.948722978539762, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.0283, "step": 9395 }, { "epoch": 2.9490368394209265, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.2019, "step": 9396 }, { "epoch": 2.949350700302091, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.3432, "step": 9397 }, { "epoch": 2.9496645611832557, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.2265, "step": 9398 }, { "epoch": 2.94997842206442, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.3992, "step": 9399 }, { "epoch": 2.9502922829455844, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.4437, "step": 9400 }, { "epoch": 2.950606143826749, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.156, "step": 9401 }, { "epoch": 2.950920004707913, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.3625, "step": 9402 }, { "epoch": 2.9512338655890775, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 1.3376, "step": 9403 }, { "epoch": 2.951547726470242, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.12, "step": 9404 }, { "epoch": 2.9518615873514067, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.3309, "step": 9405 }, { "epoch": 2.952175448232571, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3755, "step": 9406 }, { "epoch": 2.9524893091137354, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.5538, "step": 9407 }, { "epoch": 2.9528031699949, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.2567, "step": 9408 }, { "epoch": 2.953117030876064, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.7727, "step": 9409 }, { "epoch": 2.9534308917572285, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.6904, "step": 9410 }, { "epoch": 2.953744752638393, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.7954, "step": 9411 }, { "epoch": 2.9540586135195577, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.6599, "step": 9412 }, { "epoch": 2.954372474400722, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.9667, "step": 9413 }, { "epoch": 2.9546863352818864, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.517, "step": 9414 }, { "epoch": 2.9550001961630508, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.6352, "step": 9415 }, { "epoch": 2.955314057044215, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 1.8427, "step": 9416 }, { "epoch": 2.9556279179253795, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.7782, "step": 9417 }, { "epoch": 2.955941778806544, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5864, "step": 9418 }, { "epoch": 2.9562556396877087, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.3267, "step": 9419 }, { "epoch": 2.9565695005688726, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.7013, "step": 9420 }, { "epoch": 2.9568833614500374, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.4806, "step": 9421 }, { "epoch": 2.9571972223312017, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 1.9826, "step": 9422 }, { "epoch": 2.957511083212366, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.759, "step": 9423 }, { "epoch": 2.9578249440935305, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.2318, "step": 9424 }, { "epoch": 2.958138804974695, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.3547, "step": 9425 }, { "epoch": 2.9584526658558596, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.2914, "step": 9426 }, { "epoch": 2.9587665267370236, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.3337, "step": 9427 }, { "epoch": 2.9590803876181884, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.377, "step": 9428 }, { "epoch": 2.9593942484993527, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.1936, "step": 9429 }, { "epoch": 2.959708109380517, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.3915, "step": 9430 }, { "epoch": 2.9600219702616815, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2573, "step": 9431 }, { "epoch": 2.960335831142846, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.1949, "step": 9432 }, { "epoch": 2.9606496920240106, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.3062, "step": 9433 }, { "epoch": 2.9609635529051745, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2476, "step": 9434 }, { "epoch": 2.9612774137863394, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.0846, "step": 9435 }, { "epoch": 2.9615912746675037, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1504, "step": 9436 }, { "epoch": 2.961905135548668, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1983, "step": 9437 }, { "epoch": 2.9622189964298324, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.1937, "step": 9438 }, { "epoch": 2.962532857310997, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2957, "step": 9439 }, { "epoch": 2.962846718192161, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.2687, "step": 9440 }, { "epoch": 2.9631605790733255, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.0766, "step": 9441 }, { "epoch": 2.9634744399544903, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.1046, "step": 9442 }, { "epoch": 2.9637883008356547, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.1553, "step": 9443 }, { "epoch": 2.964102161716819, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.2688, "step": 9444 }, { "epoch": 2.9644160225979834, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2691, "step": 9445 }, { "epoch": 2.964729883479148, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.2599, "step": 9446 }, { "epoch": 2.965043744360312, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.377, "step": 9447 }, { "epoch": 2.9653576052414765, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.2108, "step": 9448 }, { "epoch": 2.9656714661226413, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.2736, "step": 9449 }, { "epoch": 2.9659853270038057, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.2695, "step": 9450 }, { "epoch": 2.96629918788497, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.2906, "step": 9451 }, { "epoch": 2.9666130487661344, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.5703, "step": 9452 }, { "epoch": 2.9669269096472988, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.4633, "step": 9453 }, { "epoch": 2.967240770528463, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.4221, "step": 9454 }, { "epoch": 2.9675546314096275, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.7478, "step": 9455 }, { "epoch": 2.9678684922907923, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.4688, "step": 9456 }, { "epoch": 2.9681823531719567, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.7941, "step": 9457 }, { "epoch": 2.968496214053121, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.7128, "step": 9458 }, { "epoch": 2.9688100749342854, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.8195, "step": 9459 }, { "epoch": 2.9691239358154498, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.8326, "step": 9460 }, { "epoch": 2.969437796696614, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.5712, "step": 9461 }, { "epoch": 2.9697516575777785, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.4856, "step": 9462 }, { "epoch": 2.9700655184589433, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.9067, "step": 9463 }, { "epoch": 2.9703793793401077, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.6307, "step": 9464 }, { "epoch": 2.970693240221272, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.3992, "step": 9465 }, { "epoch": 2.9710071011024364, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.7564, "step": 9466 }, { "epoch": 2.9713209619836007, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.8826, "step": 9467 }, { "epoch": 2.971634822864765, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.495, "step": 9468 }, { "epoch": 2.9719486837459295, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.5836, "step": 9469 }, { "epoch": 2.9722625446270943, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.5534, "step": 9470 }, { "epoch": 2.972576405508258, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.8829, "step": 9471 }, { "epoch": 2.972890266389423, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 2.3762, "step": 9472 }, { "epoch": 2.9732041272705874, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.553, "step": 9473 }, { "epoch": 2.9735179881517517, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.3238, "step": 9474 }, { "epoch": 2.973831849032916, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.4552, "step": 9475 }, { "epoch": 2.9741457099140804, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.3197, "step": 9476 }, { "epoch": 2.9744595707952453, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.3568, "step": 9477 }, { "epoch": 2.974773431676409, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.2488, "step": 9478 }, { "epoch": 2.975087292557574, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.3929, "step": 9479 }, { "epoch": 2.9754011534387383, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2632, "step": 9480 }, { "epoch": 2.9757150143199027, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3085, "step": 9481 }, { "epoch": 2.976028875201067, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2421, "step": 9482 }, { "epoch": 2.9763427360822314, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2425, "step": 9483 }, { "epoch": 2.9766565969633962, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2138, "step": 9484 }, { "epoch": 2.97697045784456, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1889, "step": 9485 }, { "epoch": 2.977284318725725, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2086, "step": 9486 }, { "epoch": 2.9775981796068893, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.2376, "step": 9487 }, { "epoch": 2.9779120404880537, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2716, "step": 9488 }, { "epoch": 2.978225901369218, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2318, "step": 9489 }, { "epoch": 2.9785397622503824, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1857, "step": 9490 }, { "epoch": 2.978853623131547, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1519, "step": 9491 }, { "epoch": 2.979167484012711, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.1038, "step": 9492 }, { "epoch": 2.979481344893876, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.1719, "step": 9493 }, { "epoch": 2.9797952057750403, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.2695, "step": 9494 }, { "epoch": 2.9801090666562047, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.1702, "step": 9495 }, { "epoch": 2.980422927537369, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.2703, "step": 9496 }, { "epoch": 2.9807367884185334, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.1476, "step": 9497 }, { "epoch": 2.9810506492996978, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.1776, "step": 9498 }, { "epoch": 2.981364510180862, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.187, "step": 9499 }, { "epoch": 2.981678371062027, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.1321, "step": 9500 }, { "epoch": 2.9819922319431913, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.3621, "step": 9501 }, { "epoch": 2.9823060928243557, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.4042, "step": 9502 }, { "epoch": 2.98261995370552, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.4612, "step": 9503 }, { "epoch": 2.9829338145866844, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.3821, "step": 9504 }, { "epoch": 2.9832476754678487, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.4627, "step": 9505 }, { "epoch": 2.983561536349013, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.4459, "step": 9506 }, { "epoch": 2.983875397230178, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.7426, "step": 9507 }, { "epoch": 2.9841892581113423, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 1.5772, "step": 9508 }, { "epoch": 2.9845031189925066, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.611, "step": 9509 }, { "epoch": 2.984816979873671, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 1.9711, "step": 9510 }, { "epoch": 2.9851308407548354, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 2.0818, "step": 9511 }, { "epoch": 2.9854447016359997, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.6855, "step": 9512 }, { "epoch": 2.985758562517164, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.5575, "step": 9513 }, { "epoch": 2.986072423398329, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.5207, "step": 9514 }, { "epoch": 2.9863862842794933, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 1.6288, "step": 9515 }, { "epoch": 2.9867001451606576, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.7329, "step": 9516 }, { "epoch": 2.987014006041822, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.2674, "step": 9517 }, { "epoch": 2.9873278669229864, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.6206, "step": 9518 }, { "epoch": 2.9876417278041507, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.7021, "step": 9519 }, { "epoch": 2.987955588685315, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.5577, "step": 9520 }, { "epoch": 2.98826944956648, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.1903, "step": 9521 }, { "epoch": 2.9885833104476442, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 2.6006, "step": 9522 }, { "epoch": 2.9888971713288086, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.5539, "step": 9523 }, { "epoch": 2.989211032209973, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.3249, "step": 9524 }, { "epoch": 2.9895248930911373, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.3134, "step": 9525 }, { "epoch": 2.9898387539723017, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.2997, "step": 9526 }, { "epoch": 2.990152614853466, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.2315, "step": 9527 }, { "epoch": 2.990466475734631, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2112, "step": 9528 }, { "epoch": 2.990780336615795, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2611, "step": 9529 }, { "epoch": 2.9910941974969596, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3462, "step": 9530 }, { "epoch": 2.991408058378124, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2274, "step": 9531 }, { "epoch": 2.9917219192592883, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.2384, "step": 9532 }, { "epoch": 2.9920357801404527, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.3161, "step": 9533 }, { "epoch": 2.992349641021617, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.1396, "step": 9534 }, { "epoch": 2.992663501902782, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.1765, "step": 9535 }, { "epoch": 2.9929773627839458, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.2897, "step": 9536 }, { "epoch": 2.9932912236651106, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.1747, "step": 9537 }, { "epoch": 2.9932912236651106, "eval_loss": 1.8602570295333862, "eval_runtime": 123.0153, "eval_samples_per_second": 8.129, "eval_steps_per_second": 8.129, "step": 9537 }, { "epoch": 2.9932912236651106, "mmlu_eval_accuracy": 0.42444306629488804, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.5, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.4482758620689655, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.5454545454545454, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.46153846153846156, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.6, "mmlu_eval_accuracy_high_school_biology": 0.28125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.6111111111111112, "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, "mmlu_eval_accuracy_high_school_government_and_politics": 0.5238095238095238, "mmlu_eval_accuracy_high_school_macroeconomics": 0.5116279069767442, "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.6, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.5, "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.7692307692307693, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.72, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.48484848484848486, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.31176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.6363636363636364, "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.5789473684210527, "mmlu_loss": 1.2572945500416821, "step": 9537 }, { "epoch": 2.993605084546275, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.0661, "step": 9538 }, { "epoch": 2.9939189454274393, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.1755, "step": 9539 }, { "epoch": 2.9942328063086037, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.1012, "step": 9540 }, { "epoch": 2.994546667189768, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.0115, "step": 9541 }, { "epoch": 2.9948605280709324, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.3731, "step": 9542 }, { "epoch": 2.9951743889520968, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.1223, "step": 9543 }, { "epoch": 2.9954882498332616, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 1.6787, "step": 9544 }, { "epoch": 2.995802110714426, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.7123, "step": 9545 }, { "epoch": 2.9961159715955903, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.5949, "step": 9546 }, { "epoch": 2.9964298324767547, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.7979, "step": 9547 }, { "epoch": 2.996743693357919, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.9313, "step": 9548 }, { "epoch": 2.9970575542390834, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.3465, "step": 9549 }, { "epoch": 2.9973714151202477, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.2209, "step": 9550 }, { "epoch": 2.9976852760014125, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.3464, "step": 9551 }, { "epoch": 2.997999136882577, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.8226, "step": 9552 }, { "epoch": 2.9983129977637413, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.2924, "step": 9553 }, { "epoch": 2.9986268586449056, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 1.7133, "step": 9554 }, { "epoch": 2.99894071952607, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.7165, "step": 9555 }, { "epoch": 2.9992545804072344, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.6035, "step": 9556 }, { "epoch": 2.9995684412883987, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 1.8044, "step": 9557 }, { "epoch": 2.9998823021695635, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.9065, "step": 9558 }, { "epoch": 2.9998823021695635, "step": 9558, "total_flos": 5.173921474210529e+17, "train_loss": 1.616389752793796, "train_runtime": 177951.3432, "train_samples_per_second": 0.859, "train_steps_per_second": 0.054 } ], "logging_steps": 1, "max_steps": 9558, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.173921474210529e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }