{ "best_metric": 0.9417692129092176, "best_model_checkpoint": "Crosswalk/dinov2/checkpoint-924", "epoch": 22.0, "eval_steps": 500, "global_step": 924, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.24242424242424243, "grad_norm": 1809.6781005859375, "learning_rate": 9.70873786407767e-07, "loss": 4.7087, "step": 10 }, { "epoch": 0.48484848484848486, "grad_norm": 190.5601806640625, "learning_rate": 1.941747572815534e-06, "loss": 3.034, "step": 20 }, { "epoch": 0.7272727272727273, "grad_norm": 76.29146575927734, "learning_rate": 2.912621359223301e-06, "loss": 2.0024, "step": 30 }, { "epoch": 0.9696969696969697, "grad_norm": 75.1009750366211, "learning_rate": 3.883495145631068e-06, "loss": 1.4019, "step": 40 }, { "epoch": 1.0, "eval_loss": 0.2978227138519287, "eval_macro_f1": 0.8815037150933147, "eval_runtime": 7.5247, "eval_samples_per_second": 43.856, "eval_steps_per_second": 5.582, "step": 42 }, { "epoch": 1.1939393939393939, "grad_norm": 86.90376281738281, "learning_rate": 4.854368932038836e-06, "loss": 0.7179, "step": 50 }, { "epoch": 1.4363636363636363, "grad_norm": 118.144287109375, "learning_rate": 5.825242718446602e-06, "loss": 0.9737, "step": 60 }, { "epoch": 1.6787878787878787, "grad_norm": 174.987548828125, "learning_rate": 6.79611650485437e-06, "loss": 1.3976, "step": 70 }, { "epoch": 1.9212121212121214, "grad_norm": 35.86643981933594, "learning_rate": 7.766990291262136e-06, "loss": 0.967, "step": 80 }, { "epoch": 2.0, "eval_loss": 0.22084859013557434, "eval_macro_f1": 0.9229339286881953, "eval_runtime": 5.8751, "eval_samples_per_second": 56.169, "eval_steps_per_second": 7.149, "step": 84 }, { "epoch": 2.1454545454545455, "grad_norm": 232.57371520996094, "learning_rate": 8.737864077669904e-06, "loss": 1.323, "step": 90 }, { "epoch": 2.3878787878787877, "grad_norm": 72.19599914550781, "learning_rate": 9.708737864077671e-06, "loss": 1.005, "step": 100 }, { "epoch": 2.6303030303030304, "grad_norm": 55.63515853881836, "learning_rate": 9.924078091106291e-06, "loss": 0.6989, "step": 110 }, { "epoch": 2.8727272727272726, "grad_norm": 56.631591796875, "learning_rate": 9.815618221258135e-06, "loss": 0.7527, "step": 120 }, { "epoch": 3.0, "eval_loss": 0.3025018870830536, "eval_macro_f1": 0.9009343690194753, "eval_runtime": 6.0811, "eval_samples_per_second": 54.266, "eval_steps_per_second": 6.907, "step": 126 }, { "epoch": 3.096969696969697, "grad_norm": 40.3350715637207, "learning_rate": 9.70715835140998e-06, "loss": 0.6221, "step": 130 }, { "epoch": 3.3393939393939394, "grad_norm": 95.85454559326172, "learning_rate": 9.598698481561823e-06, "loss": 0.9071, "step": 140 }, { "epoch": 3.581818181818182, "grad_norm": 94.18701934814453, "learning_rate": 9.490238611713667e-06, "loss": 0.7042, "step": 150 }, { "epoch": 3.824242424242424, "grad_norm": 477.8069763183594, "learning_rate": 9.38177874186551e-06, "loss": 0.635, "step": 160 }, { "epoch": 4.0, "eval_loss": 0.22365985810756683, "eval_macro_f1": 0.9049918736939866, "eval_runtime": 5.9294, "eval_samples_per_second": 55.655, "eval_steps_per_second": 7.083, "step": 168 }, { "epoch": 4.048484848484849, "grad_norm": 110.27919006347656, "learning_rate": 9.273318872017354e-06, "loss": 1.1381, "step": 170 }, { "epoch": 4.290909090909091, "grad_norm": 58.735958099365234, "learning_rate": 9.1648590021692e-06, "loss": 0.7225, "step": 180 }, { "epoch": 4.533333333333333, "grad_norm": 75.20926666259766, "learning_rate": 9.056399132321042e-06, "loss": 0.4634, "step": 190 }, { "epoch": 4.775757575757575, "grad_norm": 18.876136779785156, "learning_rate": 8.947939262472886e-06, "loss": 0.6293, "step": 200 }, { "epoch": 5.0, "grad_norm": 8.281109809875488, "learning_rate": 8.83947939262473e-06, "loss": 0.6632, "step": 210 }, { "epoch": 5.0, "eval_loss": 0.2299780696630478, "eval_macro_f1": 0.9176304185040354, "eval_runtime": 6.0281, "eval_samples_per_second": 54.744, "eval_steps_per_second": 6.967, "step": 210 }, { "epoch": 5.242424242424242, "grad_norm": 13.756321907043457, "learning_rate": 8.731019522776574e-06, "loss": 0.4708, "step": 220 }, { "epoch": 5.484848484848484, "grad_norm": 59.22605895996094, "learning_rate": 8.622559652928418e-06, "loss": 0.7127, "step": 230 }, { "epoch": 5.7272727272727275, "grad_norm": 32.43043899536133, "learning_rate": 8.514099783080262e-06, "loss": 0.5682, "step": 240 }, { "epoch": 5.96969696969697, "grad_norm": 54.722599029541016, "learning_rate": 8.405639913232104e-06, "loss": 0.8667, "step": 250 }, { "epoch": 6.0, "eval_loss": 0.2767850160598755, "eval_macro_f1": 0.9210700618192522, "eval_runtime": 6.2004, "eval_samples_per_second": 53.223, "eval_steps_per_second": 6.774, "step": 252 }, { "epoch": 6.193939393939394, "grad_norm": 18.175823211669922, "learning_rate": 8.29718004338395e-06, "loss": 0.6752, "step": 260 }, { "epoch": 6.4363636363636365, "grad_norm": 56.679988861083984, "learning_rate": 8.188720173535792e-06, "loss": 0.3727, "step": 270 }, { "epoch": 6.678787878787879, "grad_norm": 57.3917236328125, "learning_rate": 8.080260303687636e-06, "loss": 1.0167, "step": 280 }, { "epoch": 6.921212121212121, "grad_norm": 42.186038970947266, "learning_rate": 7.97180043383948e-06, "loss": 0.9377, "step": 290 }, { "epoch": 7.0, "eval_loss": 0.29274508357048035, "eval_macro_f1": 0.9138863000931967, "eval_runtime": 6.1213, "eval_samples_per_second": 53.91, "eval_steps_per_second": 6.861, "step": 294 }, { "epoch": 7.1454545454545455, "grad_norm": 41.31782531738281, "learning_rate": 7.863340563991324e-06, "loss": 0.3818, "step": 300 }, { "epoch": 7.387878787878788, "grad_norm": 4.223178863525391, "learning_rate": 7.754880694143168e-06, "loss": 0.4503, "step": 310 }, { "epoch": 7.63030303030303, "grad_norm": 35.64258575439453, "learning_rate": 7.646420824295012e-06, "loss": 0.6038, "step": 320 }, { "epoch": 7.872727272727273, "grad_norm": 37.91206359863281, "learning_rate": 7.537960954446856e-06, "loss": 0.5407, "step": 330 }, { "epoch": 8.0, "eval_loss": 0.20143219828605652, "eval_macro_f1": 0.9357970705676355, "eval_runtime": 5.9715, "eval_samples_per_second": 55.263, "eval_steps_per_second": 7.033, "step": 336 }, { "epoch": 8.096969696969698, "grad_norm": 9.571391105651855, "learning_rate": 7.429501084598699e-06, "loss": 0.3311, "step": 340 }, { "epoch": 8.33939393939394, "grad_norm": 30.14655876159668, "learning_rate": 7.321041214750543e-06, "loss": 0.5367, "step": 350 }, { "epoch": 8.581818181818182, "grad_norm": 125.38350677490234, "learning_rate": 7.212581344902386e-06, "loss": 0.4511, "step": 360 }, { "epoch": 8.824242424242424, "grad_norm": 283.20819091796875, "learning_rate": 7.104121475054231e-06, "loss": 0.5474, "step": 370 }, { "epoch": 9.0, "eval_loss": 0.329227477312088, "eval_macro_f1": 0.8817302125547928, "eval_runtime": 5.984, "eval_samples_per_second": 55.147, "eval_steps_per_second": 7.019, "step": 378 }, { "epoch": 9.048484848484849, "grad_norm": 31.927379608154297, "learning_rate": 6.995661605206075e-06, "loss": 0.3963, "step": 380 }, { "epoch": 9.290909090909091, "grad_norm": 10.10098934173584, "learning_rate": 6.887201735357918e-06, "loss": 0.445, "step": 390 }, { "epoch": 9.533333333333333, "grad_norm": 1.947770118713379, "learning_rate": 6.778741865509761e-06, "loss": 0.5947, "step": 400 }, { "epoch": 9.775757575757575, "grad_norm": 54.11802673339844, "learning_rate": 6.670281995661606e-06, "loss": 0.6001, "step": 410 }, { "epoch": 10.0, "grad_norm": 0.004518165718764067, "learning_rate": 6.56182212581345e-06, "loss": 0.412, "step": 420 }, { "epoch": 10.0, "eval_loss": 0.3594599962234497, "eval_macro_f1": 0.907735321528425, "eval_runtime": 6.0125, "eval_samples_per_second": 54.885, "eval_steps_per_second": 6.985, "step": 420 }, { "epoch": 10.242424242424242, "grad_norm": 0.22337216138839722, "learning_rate": 6.453362255965293e-06, "loss": 0.2798, "step": 430 }, { "epoch": 10.484848484848484, "grad_norm": 39.639984130859375, "learning_rate": 6.344902386117138e-06, "loss": 0.4377, "step": 440 }, { "epoch": 10.727272727272727, "grad_norm": 53.85198211669922, "learning_rate": 6.236442516268981e-06, "loss": 0.2063, "step": 450 }, { "epoch": 10.969696969696969, "grad_norm": 69.08942413330078, "learning_rate": 6.127982646420825e-06, "loss": 0.2884, "step": 460 }, { "epoch": 11.0, "eval_loss": 0.2930862307548523, "eval_macro_f1": 0.9380839806371721, "eval_runtime": 6.0147, "eval_samples_per_second": 54.866, "eval_steps_per_second": 6.983, "step": 462 }, { "epoch": 11.193939393939393, "grad_norm": 63.143218994140625, "learning_rate": 6.019522776572668e-06, "loss": 0.6075, "step": 470 }, { "epoch": 11.436363636363636, "grad_norm": 7.950187683105469, "learning_rate": 5.911062906724513e-06, "loss": 0.2654, "step": 480 }, { "epoch": 11.67878787878788, "grad_norm": 82.30758666992188, "learning_rate": 5.802603036876356e-06, "loss": 0.2474, "step": 490 }, { "epoch": 11.921212121212122, "grad_norm": 7.340689182281494, "learning_rate": 5.6941431670282e-06, "loss": 0.2405, "step": 500 }, { "epoch": 12.0, "eval_loss": 0.3316686451435089, "eval_macro_f1": 0.9209216589861751, "eval_runtime": 5.8916, "eval_samples_per_second": 56.012, "eval_steps_per_second": 7.129, "step": 504 }, { "epoch": 12.145454545454545, "grad_norm": 37.141632080078125, "learning_rate": 5.585683297180043e-06, "loss": 0.3349, "step": 510 }, { "epoch": 12.387878787878789, "grad_norm": 34.024383544921875, "learning_rate": 5.477223427331888e-06, "loss": 0.1742, "step": 520 }, { "epoch": 12.63030303030303, "grad_norm": 46.10781478881836, "learning_rate": 5.368763557483731e-06, "loss": 0.2115, "step": 530 }, { "epoch": 12.872727272727273, "grad_norm": 126.67015838623047, "learning_rate": 5.260303687635575e-06, "loss": 0.8788, "step": 540 }, { "epoch": 13.0, "eval_loss": 0.37741926312446594, "eval_macro_f1": 0.9058106453305834, "eval_runtime": 6.6329, "eval_samples_per_second": 49.752, "eval_steps_per_second": 6.332, "step": 546 }, { "epoch": 13.096969696969698, "grad_norm": 35.44662094116211, "learning_rate": 5.151843817787418e-06, "loss": 0.5591, "step": 550 }, { "epoch": 13.33939393939394, "grad_norm": 57.34544372558594, "learning_rate": 5.043383947939263e-06, "loss": 0.213, "step": 560 }, { "epoch": 13.581818181818182, "grad_norm": 15.285223960876465, "learning_rate": 4.934924078091107e-06, "loss": 0.203, "step": 570 }, { "epoch": 13.824242424242424, "grad_norm": 28.99003028869629, "learning_rate": 4.82646420824295e-06, "loss": 0.4163, "step": 580 }, { "epoch": 14.0, "eval_loss": 0.39865490794181824, "eval_macro_f1": 0.9196508840275697, "eval_runtime": 5.8701, "eval_samples_per_second": 56.217, "eval_steps_per_second": 7.155, "step": 588 }, { "epoch": 14.048484848484849, "grad_norm": 346.8255310058594, "learning_rate": 4.718004338394794e-06, "loss": 0.26, "step": 590 }, { "epoch": 14.290909090909091, "grad_norm": 38.04654312133789, "learning_rate": 4.609544468546638e-06, "loss": 0.3813, "step": 600 }, { "epoch": 14.533333333333333, "grad_norm": 34.71643829345703, "learning_rate": 4.501084598698482e-06, "loss": 0.0974, "step": 610 }, { "epoch": 14.775757575757575, "grad_norm": 34.031890869140625, "learning_rate": 4.392624728850326e-06, "loss": 0.4881, "step": 620 }, { "epoch": 15.0, "grad_norm": 0.0002771662548184395, "learning_rate": 4.284164859002169e-06, "loss": 0.4126, "step": 630 }, { "epoch": 15.0, "eval_loss": 0.35451531410217285, "eval_macro_f1": 0.9235679411519468, "eval_runtime": 6.0428, "eval_samples_per_second": 54.611, "eval_steps_per_second": 6.95, "step": 630 }, { "epoch": 15.242424242424242, "grad_norm": 82.48748779296875, "learning_rate": 4.175704989154013e-06, "loss": 0.4444, "step": 640 }, { "epoch": 15.484848484848484, "grad_norm": 0.2618753910064697, "learning_rate": 4.067245119305857e-06, "loss": 0.2083, "step": 650 }, { "epoch": 15.727272727272727, "grad_norm": 98.34405517578125, "learning_rate": 3.958785249457701e-06, "loss": 0.4785, "step": 660 }, { "epoch": 15.969696969696969, "grad_norm": 0.37142229080200195, "learning_rate": 3.8503253796095445e-06, "loss": 0.1583, "step": 670 }, { "epoch": 16.0, "eval_loss": 0.38117873668670654, "eval_macro_f1": 0.9268860086407444, "eval_runtime": 6.9311, "eval_samples_per_second": 47.612, "eval_steps_per_second": 6.06, "step": 672 }, { "epoch": 16.193939393939395, "grad_norm": 40.54330825805664, "learning_rate": 3.741865509761389e-06, "loss": 0.0774, "step": 680 }, { "epoch": 16.436363636363637, "grad_norm": 0.23675695061683655, "learning_rate": 3.6334056399132324e-06, "loss": 0.1639, "step": 690 }, { "epoch": 16.67878787878788, "grad_norm": 47.12529373168945, "learning_rate": 3.5249457700650764e-06, "loss": 0.306, "step": 700 }, { "epoch": 16.921212121212122, "grad_norm": 0.3993530571460724, "learning_rate": 3.41648590021692e-06, "loss": 0.2376, "step": 710 }, { "epoch": 17.0, "eval_loss": 0.4087267816066742, "eval_macro_f1": 0.9295990205081115, "eval_runtime": 6.1306, "eval_samples_per_second": 53.828, "eval_steps_per_second": 6.851, "step": 714 }, { "epoch": 17.145454545454545, "grad_norm": 0.34205177426338196, "learning_rate": 3.308026030368764e-06, "loss": 0.0332, "step": 720 }, { "epoch": 17.387878787878787, "grad_norm": 0.5112647414207458, "learning_rate": 3.1995661605206075e-06, "loss": 0.1332, "step": 730 }, { "epoch": 17.63030303030303, "grad_norm": 120.2950439453125, "learning_rate": 3.0911062906724515e-06, "loss": 0.2503, "step": 740 }, { "epoch": 17.87272727272727, "grad_norm": 223.04759216308594, "learning_rate": 2.982646420824295e-06, "loss": 0.2703, "step": 750 }, { "epoch": 18.0, "eval_loss": 0.43362897634506226, "eval_macro_f1": 0.9264924264924266, "eval_runtime": 5.861, "eval_samples_per_second": 56.305, "eval_steps_per_second": 7.166, "step": 756 }, { "epoch": 18.096969696969698, "grad_norm": 54.66193771362305, "learning_rate": 2.874186550976139e-06, "loss": 0.1274, "step": 760 }, { "epoch": 18.33939393939394, "grad_norm": 54.846466064453125, "learning_rate": 2.765726681127983e-06, "loss": 0.2751, "step": 770 }, { "epoch": 18.581818181818182, "grad_norm": 53.97863006591797, "learning_rate": 2.6572668112798266e-06, "loss": 0.359, "step": 780 }, { "epoch": 18.824242424242424, "grad_norm": 81.63549041748047, "learning_rate": 2.5488069414316706e-06, "loss": 0.1819, "step": 790 }, { "epoch": 19.0, "eval_loss": 0.3480012118816376, "eval_macro_f1": 0.9236528192931639, "eval_runtime": 7.0471, "eval_samples_per_second": 46.828, "eval_steps_per_second": 5.96, "step": 798 }, { "epoch": 19.048484848484847, "grad_norm": 41.54087448120117, "learning_rate": 2.440347071583514e-06, "loss": 0.6373, "step": 800 }, { "epoch": 19.29090909090909, "grad_norm": 9.001028060913086, "learning_rate": 2.331887201735358e-06, "loss": 0.2971, "step": 810 }, { "epoch": 19.533333333333335, "grad_norm": 114.6279525756836, "learning_rate": 2.2234273318872017e-06, "loss": 0.1943, "step": 820 }, { "epoch": 19.775757575757577, "grad_norm": 18.022676467895508, "learning_rate": 2.1149674620390457e-06, "loss": 0.1207, "step": 830 }, { "epoch": 20.0, "grad_norm": 0.0001882202341221273, "learning_rate": 2.0065075921908892e-06, "loss": 0.1324, "step": 840 }, { "epoch": 20.0, "eval_loss": 0.4493299424648285, "eval_macro_f1": 0.9384902143522833, "eval_runtime": 6.0147, "eval_samples_per_second": 54.865, "eval_steps_per_second": 6.983, "step": 840 }, { "epoch": 20.242424242424242, "grad_norm": 13.740226745605469, "learning_rate": 1.8980477223427332e-06, "loss": 0.294, "step": 850 }, { "epoch": 20.484848484848484, "grad_norm": 6.870513916015625, "learning_rate": 1.7895878524945772e-06, "loss": 0.1323, "step": 860 }, { "epoch": 20.727272727272727, "grad_norm": 2.74729585647583, "learning_rate": 1.681127982646421e-06, "loss": 0.019, "step": 870 }, { "epoch": 20.96969696969697, "grad_norm": 17.18338966369629, "learning_rate": 1.572668112798265e-06, "loss": 0.1312, "step": 880 }, { "epoch": 21.0, "eval_loss": 0.40448498725891113, "eval_macro_f1": 0.9384902143522833, "eval_runtime": 6.133, "eval_samples_per_second": 53.808, "eval_steps_per_second": 6.848, "step": 882 }, { "epoch": 21.193939393939395, "grad_norm": 108.9833755493164, "learning_rate": 1.4642082429501087e-06, "loss": 0.2499, "step": 890 }, { "epoch": 21.436363636363637, "grad_norm": 0.291847825050354, "learning_rate": 1.3557483731019525e-06, "loss": 0.1708, "step": 900 }, { "epoch": 21.67878787878788, "grad_norm": 35.08168029785156, "learning_rate": 1.2472885032537963e-06, "loss": 0.0802, "step": 910 }, { "epoch": 21.921212121212122, "grad_norm": 0.05401836335659027, "learning_rate": 1.13882863340564e-06, "loss": 0.1662, "step": 920 }, { "epoch": 22.0, "eval_loss": 0.3166828453540802, "eval_macro_f1": 0.9417692129092176, "eval_runtime": 6.0442, "eval_samples_per_second": 54.598, "eval_steps_per_second": 6.949, "step": 924 } ], "logging_steps": 10, "max_steps": 1025, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0371596050603966e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }