{ "best_metric": 0.466439425945282, "best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-binary-large-2024_11_14-batch-size16_freeze_probs/checkpoint-22776", "epoch": 62.0, "eval_steps": 500, "global_step": 27156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_explained_variance": 0.3631434440612793, "eval_kl_divergence": 0.421912282705307, "eval_loss": 0.4821413457393646, "eval_mae": 0.13084472715854645, "eval_rmse": 0.173090398311615, "eval_runtime": 64.2475, "eval_samples_per_second": 36.64, "eval_steps_per_second": 2.304, "learning_rate": 0.001, "step": 438 }, { "epoch": 1.1415525114155252, "grad_norm": 0.4757365882396698, "learning_rate": 0.001, "loss": 0.5317, "step": 500 }, { "epoch": 2.0, "eval_explained_variance": 0.3752269744873047, "eval_kl_divergence": 0.6148446202278137, "eval_loss": 0.4784533977508545, "eval_mae": 0.12629373371601105, "eval_rmse": 0.17098082602024078, "eval_runtime": 63.6833, "eval_samples_per_second": 36.964, "eval_steps_per_second": 2.324, "learning_rate": 0.001, "step": 876 }, { "epoch": 2.2831050228310503, "grad_norm": 0.4254082143306732, "learning_rate": 0.001, "loss": 0.4832, "step": 1000 }, { "epoch": 3.0, "eval_explained_variance": 0.3838556706905365, "eval_kl_divergence": 0.48802560567855835, "eval_loss": 0.47776785492897034, "eval_mae": 0.12731628119945526, "eval_rmse": 0.16985835134983063, "eval_runtime": 62.637, "eval_samples_per_second": 37.582, "eval_steps_per_second": 2.363, "learning_rate": 0.001, "step": 1314 }, { "epoch": 3.4246575342465753, "grad_norm": 0.2670271098613739, "learning_rate": 0.001, "loss": 0.4791, "step": 1500 }, { "epoch": 4.0, "eval_explained_variance": 0.38376739621162415, "eval_kl_divergence": 0.3418101370334625, "eval_loss": 0.4793245792388916, "eval_mae": 0.12901858985424042, "eval_rmse": 0.171015664935112, "eval_runtime": 62.2828, "eval_samples_per_second": 37.795, "eval_steps_per_second": 2.376, "learning_rate": 0.001, "step": 1752 }, { "epoch": 4.566210045662101, "grad_norm": 0.20498104393482208, "learning_rate": 0.001, "loss": 0.4771, "step": 2000 }, { "epoch": 5.0, "eval_explained_variance": 0.40547776222229004, "eval_kl_divergence": 0.34562820196151733, "eval_loss": 0.47521594166755676, "eval_mae": 0.12799377739429474, "eval_rmse": 0.16736441850662231, "eval_runtime": 62.7606, "eval_samples_per_second": 37.508, "eval_steps_per_second": 2.358, "learning_rate": 0.001, "step": 2190 }, { "epoch": 5.707762557077626, "grad_norm": 0.24335043132305145, "learning_rate": 0.001, "loss": 0.4752, "step": 2500 }, { "epoch": 6.0, "eval_explained_variance": 0.3849389851093292, "eval_kl_divergence": 0.6402714848518372, "eval_loss": 0.478865385055542, "eval_mae": 0.12540282309055328, "eval_rmse": 0.17068879306316376, "eval_runtime": 63.4836, "eval_samples_per_second": 37.08, "eval_steps_per_second": 2.331, "learning_rate": 0.001, "step": 2628 }, { "epoch": 6.8493150684931505, "grad_norm": 0.18768365681171417, "learning_rate": 0.001, "loss": 0.4752, "step": 3000 }, { "epoch": 7.0, "eval_explained_variance": 0.3788411617279053, "eval_kl_divergence": 0.5491646528244019, "eval_loss": 0.4779475927352905, "eval_mae": 0.12878474593162537, "eval_rmse": 0.17091502249240875, "eval_runtime": 63.4904, "eval_samples_per_second": 37.076, "eval_steps_per_second": 2.331, "learning_rate": 0.001, "step": 3066 }, { "epoch": 7.9908675799086755, "grad_norm": 0.1587909311056137, "learning_rate": 0.001, "loss": 0.4735, "step": 3500 }, { "epoch": 8.0, "eval_explained_variance": 0.40857037901878357, "eval_kl_divergence": 0.33827269077301025, "eval_loss": 0.4756968021392822, "eval_mae": 0.12695902585983276, "eval_rmse": 0.16784566640853882, "eval_runtime": 64.8792, "eval_samples_per_second": 36.283, "eval_steps_per_second": 2.281, "learning_rate": 0.001, "step": 3504 }, { "epoch": 9.0, "eval_explained_variance": 0.4111548960208893, "eval_kl_divergence": 0.5417521595954895, "eval_loss": 0.4731782376766205, "eval_mae": 0.12311580032110214, "eval_rmse": 0.1657222956418991, "eval_runtime": 61.0673, "eval_samples_per_second": 38.548, "eval_steps_per_second": 2.424, "learning_rate": 0.001, "step": 3942 }, { "epoch": 9.132420091324201, "grad_norm": 0.1892658919095993, "learning_rate": 0.001, "loss": 0.4719, "step": 4000 }, { "epoch": 10.0, "eval_explained_variance": 0.4006313979625702, "eval_kl_divergence": 0.15472176671028137, "eval_loss": 0.4799855649471283, "eval_mae": 0.1320570707321167, "eval_rmse": 0.1722680777311325, "eval_runtime": 62.7974, "eval_samples_per_second": 37.486, "eval_steps_per_second": 2.357, "learning_rate": 0.001, "step": 4380 }, { "epoch": 10.273972602739725, "grad_norm": 0.20271532237529755, "learning_rate": 0.001, "loss": 0.4727, "step": 4500 }, { "epoch": 11.0, "eval_explained_variance": 0.41580215096473694, "eval_kl_divergence": 0.3436921238899231, "eval_loss": 0.4731641411781311, "eval_mae": 0.12562014162540436, "eval_rmse": 0.16564464569091797, "eval_runtime": 62.757, "eval_samples_per_second": 37.51, "eval_steps_per_second": 2.358, "learning_rate": 0.001, "step": 4818 }, { "epoch": 11.415525114155251, "grad_norm": 0.15571434795856476, "learning_rate": 0.001, "loss": 0.4723, "step": 5000 }, { "epoch": 12.0, "eval_explained_variance": 0.401115745306015, "eval_kl_divergence": 0.2946830093860626, "eval_loss": 0.47767141461372375, "eval_mae": 0.12927968800067902, "eval_rmse": 0.17007046937942505, "eval_runtime": 62.3684, "eval_samples_per_second": 37.743, "eval_steps_per_second": 2.373, "learning_rate": 0.001, "step": 5256 }, { "epoch": 12.557077625570777, "grad_norm": 0.16043365001678467, "learning_rate": 0.001, "loss": 0.4735, "step": 5500 }, { "epoch": 13.0, "eval_explained_variance": 0.39704158902168274, "eval_kl_divergence": 0.6136478781700134, "eval_loss": 0.48009705543518066, "eval_mae": 0.12475714087486267, "eval_rmse": 0.16773907840251923, "eval_runtime": 64.7917, "eval_samples_per_second": 36.332, "eval_steps_per_second": 2.284, "learning_rate": 0.001, "step": 5694 }, { "epoch": 13.698630136986301, "grad_norm": 0.13616104423999786, "learning_rate": 0.001, "loss": 0.4728, "step": 6000 }, { "epoch": 14.0, "eval_explained_variance": 0.40355002880096436, "eval_kl_divergence": Infinity, "eval_loss": 0.4954195022583008, "eval_mae": 0.12534154951572418, "eval_rmse": 0.16692323982715607, "eval_runtime": 62.1801, "eval_samples_per_second": 37.858, "eval_steps_per_second": 2.38, "learning_rate": 0.001, "step": 6132 }, { "epoch": 14.840182648401827, "grad_norm": 0.12133222818374634, "learning_rate": 0.001, "loss": 0.4713, "step": 6500 }, { "epoch": 15.0, "eval_explained_variance": 0.4051372706890106, "eval_kl_divergence": Infinity, "eval_loss": 0.4812238812446594, "eval_mae": 0.12540575861930847, "eval_rmse": 0.16624794900417328, "eval_runtime": 61.3206, "eval_samples_per_second": 38.388, "eval_steps_per_second": 2.414, "learning_rate": 0.001, "step": 6570 }, { "epoch": 15.981735159817351, "grad_norm": 0.11760278791189194, "learning_rate": 0.0001, "loss": 0.4706, "step": 7000 }, { "epoch": 16.0, "eval_explained_variance": 0.41243478655815125, "eval_kl_divergence": Infinity, "eval_loss": 0.4858487546443939, "eval_mae": 0.12432911992073059, "eval_rmse": 0.16562338173389435, "eval_runtime": 61.1501, "eval_samples_per_second": 38.495, "eval_steps_per_second": 2.42, "learning_rate": 0.0001, "step": 7008 }, { "epoch": 17.0, "eval_explained_variance": 0.4291960895061493, "eval_kl_divergence": 0.41650328040122986, "eval_loss": 0.47084349393844604, "eval_mae": 0.12233477830886841, "eval_rmse": 0.162751242518425, "eval_runtime": 60.5736, "eval_samples_per_second": 38.862, "eval_steps_per_second": 2.443, "learning_rate": 0.0001, "step": 7446 }, { "epoch": 17.123287671232877, "grad_norm": 0.13284093141555786, "learning_rate": 0.0001, "loss": 0.4672, "step": 7500 }, { "epoch": 18.0, "eval_explained_variance": 0.43114474415779114, "eval_kl_divergence": 0.4066373407840729, "eval_loss": 0.4707622528076172, "eval_mae": 0.12164173275232315, "eval_rmse": 0.16261519491672516, "eval_runtime": 63.1248, "eval_samples_per_second": 37.291, "eval_steps_per_second": 2.345, "learning_rate": 0.0001, "step": 7884 }, { "epoch": 18.264840182648403, "grad_norm": 0.11859569698572159, "learning_rate": 0.0001, "loss": 0.4659, "step": 8000 }, { "epoch": 19.0, "eval_explained_variance": 0.4342735707759857, "eval_kl_divergence": 0.31854644417762756, "eval_loss": 0.47095733880996704, "eval_mae": 0.12272538989782333, "eval_rmse": 0.16323107481002808, "eval_runtime": 61.7089, "eval_samples_per_second": 38.147, "eval_steps_per_second": 2.398, "learning_rate": 0.0001, "step": 8322 }, { "epoch": 19.40639269406393, "grad_norm": 0.16951066255569458, "learning_rate": 0.0001, "loss": 0.4653, "step": 8500 }, { "epoch": 20.0, "eval_explained_variance": 0.43487218022346497, "eval_kl_divergence": 0.465139240026474, "eval_loss": 0.4696938395500183, "eval_mae": 0.12050192803144455, "eval_rmse": 0.1620241105556488, "eval_runtime": 61.0162, "eval_samples_per_second": 38.58, "eval_steps_per_second": 2.426, "learning_rate": 0.0001, "step": 8760 }, { "epoch": 20.54794520547945, "grad_norm": 0.167369082570076, "learning_rate": 0.0001, "loss": 0.4653, "step": 9000 }, { "epoch": 21.0, "eval_explained_variance": 0.4389828145503998, "eval_kl_divergence": 0.3772728741168976, "eval_loss": 0.46922874450683594, "eval_mae": 0.12155676633119583, "eval_rmse": 0.16139467060565948, "eval_runtime": 62.17, "eval_samples_per_second": 37.864, "eval_steps_per_second": 2.381, "learning_rate": 0.0001, "step": 9198 }, { "epoch": 21.689497716894977, "grad_norm": 0.1247042864561081, "learning_rate": 0.0001, "loss": 0.4659, "step": 9500 }, { "epoch": 22.0, "eval_explained_variance": 0.43975934386253357, "eval_kl_divergence": 0.4611187279224396, "eval_loss": 0.4685634672641754, "eval_mae": 0.1203194335103035, "eval_rmse": 0.16088876128196716, "eval_runtime": 62.0052, "eval_samples_per_second": 37.965, "eval_steps_per_second": 2.387, "learning_rate": 0.0001, "step": 9636 }, { "epoch": 22.831050228310502, "grad_norm": 0.16208066046237946, "learning_rate": 0.0001, "loss": 0.465, "step": 10000 }, { "epoch": 23.0, "eval_explained_variance": 0.44279029965400696, "eval_kl_divergence": 0.24986685812473297, "eval_loss": 0.47018975019454956, "eval_mae": 0.12256480008363724, "eval_rmse": 0.16208301484584808, "eval_runtime": 61.6543, "eval_samples_per_second": 38.181, "eval_steps_per_second": 2.4, "learning_rate": 0.0001, "step": 10074 }, { "epoch": 23.972602739726028, "grad_norm": 0.17417912185192108, "learning_rate": 0.0001, "loss": 0.4633, "step": 10500 }, { "epoch": 24.0, "eval_explained_variance": 0.4367590844631195, "eval_kl_divergence": 0.3702172040939331, "eval_loss": 0.4705464243888855, "eval_mae": 0.12131566554307938, "eval_rmse": 0.16277877986431122, "eval_runtime": 62.8273, "eval_samples_per_second": 37.468, "eval_steps_per_second": 2.356, "learning_rate": 0.0001, "step": 10512 }, { "epoch": 25.0, "eval_explained_variance": 0.4433206617832184, "eval_kl_divergence": 0.5132729411125183, "eval_loss": 0.4678299129009247, "eval_mae": 0.11875440925359726, "eval_rmse": 0.16013289988040924, "eval_runtime": 61.7077, "eval_samples_per_second": 38.148, "eval_steps_per_second": 2.398, "learning_rate": 0.0001, "step": 10950 }, { "epoch": 25.114155251141554, "grad_norm": 0.13617579638957977, "learning_rate": 0.0001, "loss": 0.4656, "step": 11000 }, { "epoch": 26.0, "eval_explained_variance": 0.4423791468143463, "eval_kl_divergence": 0.5665323138237, "eval_loss": 0.46802961826324463, "eval_mae": 0.117874376475811, "eval_rmse": 0.1604483276605606, "eval_runtime": 61.9639, "eval_samples_per_second": 37.99, "eval_steps_per_second": 2.388, "learning_rate": 0.0001, "step": 11388 }, { "epoch": 26.255707762557076, "grad_norm": 0.15818916261196136, "learning_rate": 0.0001, "loss": 0.4629, "step": 11500 }, { "epoch": 27.0, "eval_explained_variance": 0.4434410333633423, "eval_kl_divergence": 0.42424070835113525, "eval_loss": 0.4680938124656677, "eval_mae": 0.1199984923005104, "eval_rmse": 0.16038183867931366, "eval_runtime": 62.3144, "eval_samples_per_second": 37.776, "eval_steps_per_second": 2.375, "learning_rate": 0.0001, "step": 11826 }, { "epoch": 27.397260273972602, "grad_norm": 0.15971983969211578, "learning_rate": 0.0001, "loss": 0.4636, "step": 12000 }, { "epoch": 28.0, "eval_explained_variance": 0.44512465596199036, "eval_kl_divergence": 0.2967982292175293, "eval_loss": 0.4693257212638855, "eval_mae": 0.12149528414011002, "eval_rmse": 0.1616295725107193, "eval_runtime": 66.789, "eval_samples_per_second": 35.245, "eval_steps_per_second": 2.216, "learning_rate": 0.0001, "step": 12264 }, { "epoch": 28.538812785388128, "grad_norm": 0.15448875725269318, "learning_rate": 0.0001, "loss": 0.4633, "step": 12500 }, { "epoch": 29.0, "eval_explained_variance": 0.4442717730998993, "eval_kl_divergence": 0.3924856185913086, "eval_loss": 0.46847742795944214, "eval_mae": 0.1196620985865593, "eval_rmse": 0.16072382032871246, "eval_runtime": 61.9086, "eval_samples_per_second": 38.024, "eval_steps_per_second": 2.391, "learning_rate": 0.0001, "step": 12702 }, { "epoch": 29.680365296803654, "grad_norm": 0.15532433986663818, "learning_rate": 0.0001, "loss": 0.4631, "step": 13000 }, { "epoch": 30.0, "eval_explained_variance": 0.4473068416118622, "eval_kl_divergence": 0.2495478093624115, "eval_loss": 0.46944141387939453, "eval_mae": 0.12209376692771912, "eval_rmse": 0.16142255067825317, "eval_runtime": 62.4285, "eval_samples_per_second": 37.707, "eval_steps_per_second": 2.371, "learning_rate": 0.0001, "step": 13140 }, { "epoch": 30.82191780821918, "grad_norm": 0.1961052566766739, "learning_rate": 0.0001, "loss": 0.463, "step": 13500 }, { "epoch": 31.0, "eval_explained_variance": 0.4445982277393341, "eval_kl_divergence": 0.45099732279777527, "eval_loss": 0.4678958058357239, "eval_mae": 0.11854251474142075, "eval_rmse": 0.16011421382427216, "eval_runtime": 61.3729, "eval_samples_per_second": 38.356, "eval_steps_per_second": 2.411, "learning_rate": 0.0001, "step": 13578 }, { "epoch": 31.963470319634702, "grad_norm": 0.3346303701400757, "learning_rate": 1e-05, "loss": 0.4623, "step": 14000 }, { "epoch": 32.0, "eval_explained_variance": 0.4478188455104828, "eval_kl_divergence": 0.3885524570941925, "eval_loss": 0.46778997778892517, "eval_mae": 0.11933697015047073, "eval_rmse": 0.16006481647491455, "eval_runtime": 63.8544, "eval_samples_per_second": 36.865, "eval_steps_per_second": 2.318, "learning_rate": 1e-05, "step": 14016 }, { "epoch": 33.0, "eval_explained_variance": 0.44756200909614563, "eval_kl_divergence": 0.31322383880615234, "eval_loss": 0.4686955511569977, "eval_mae": 0.1201881393790245, "eval_rmse": 0.16055406630039215, "eval_runtime": 62.7334, "eval_samples_per_second": 37.524, "eval_steps_per_second": 2.359, "learning_rate": 1e-05, "step": 14454 }, { "epoch": 33.10502283105023, "grad_norm": 0.21087272465229034, "learning_rate": 1e-05, "loss": 0.4621, "step": 14500 }, { "epoch": 34.0, "eval_explained_variance": 0.4478868544101715, "eval_kl_divergence": 0.3957745432853699, "eval_loss": 0.46784329414367676, "eval_mae": 0.11951460689306259, "eval_rmse": 0.1600986272096634, "eval_runtime": 60.6174, "eval_samples_per_second": 38.834, "eval_steps_per_second": 2.442, "learning_rate": 1e-05, "step": 14892 }, { "epoch": 34.24657534246575, "grad_norm": 0.1875353455543518, "learning_rate": 1e-05, "loss": 0.4607, "step": 15000 }, { "epoch": 35.0, "eval_explained_variance": 0.44849491119384766, "eval_kl_divergence": 0.45786312222480774, "eval_loss": 0.4671097695827484, "eval_mae": 0.11800643056631088, "eval_rmse": 0.15947793424129486, "eval_runtime": 61.7609, "eval_samples_per_second": 38.115, "eval_steps_per_second": 2.396, "learning_rate": 1e-05, "step": 15330 }, { "epoch": 35.38812785388128, "grad_norm": 0.16752338409423828, "learning_rate": 1e-05, "loss": 0.4619, "step": 15500 }, { "epoch": 36.0, "eval_explained_variance": 0.44863569736480713, "eval_kl_divergence": 0.43913933634757996, "eval_loss": 0.46735426783561707, "eval_mae": 0.11842861026525497, "eval_rmse": 0.15950414538383484, "eval_runtime": 64.5571, "eval_samples_per_second": 36.464, "eval_steps_per_second": 2.293, "learning_rate": 1e-05, "step": 15768 }, { "epoch": 36.529680365296805, "grad_norm": 0.15660376846790314, "learning_rate": 1e-05, "loss": 0.4612, "step": 16000 }, { "epoch": 37.0, "eval_explained_variance": 0.44943228363990784, "eval_kl_divergence": 0.36332887411117554, "eval_loss": 0.468018501996994, "eval_mae": 0.11912700533866882, "eval_rmse": 0.16002707183361053, "eval_runtime": 63.2971, "eval_samples_per_second": 37.19, "eval_steps_per_second": 2.338, "learning_rate": 1e-05, "step": 16206 }, { "epoch": 37.67123287671233, "grad_norm": 0.15865331888198853, "learning_rate": 1e-05, "loss": 0.4625, "step": 16500 }, { "epoch": 38.0, "eval_explained_variance": 0.45025742053985596, "eval_kl_divergence": 0.43029093742370605, "eval_loss": 0.46701580286026, "eval_mae": 0.1186341941356659, "eval_rmse": 0.15923398733139038, "eval_runtime": 63.0229, "eval_samples_per_second": 37.351, "eval_steps_per_second": 2.348, "learning_rate": 1e-05, "step": 16644 }, { "epoch": 38.81278538812786, "grad_norm": 0.2913804352283478, "learning_rate": 1e-05, "loss": 0.4608, "step": 17000 }, { "epoch": 39.0, "eval_explained_variance": 0.448851078748703, "eval_kl_divergence": 0.4562166929244995, "eval_loss": 0.4673251509666443, "eval_mae": 0.11870113760232925, "eval_rmse": 0.1596096307039261, "eval_runtime": 63.132, "eval_samples_per_second": 37.287, "eval_steps_per_second": 2.344, "learning_rate": 1e-05, "step": 17082 }, { "epoch": 39.954337899543376, "grad_norm": 0.1813182234764099, "learning_rate": 1e-05, "loss": 0.4614, "step": 17500 }, { "epoch": 40.0, "eval_explained_variance": 0.449774831533432, "eval_kl_divergence": 0.40653547644615173, "eval_loss": 0.4673212468624115, "eval_mae": 0.1188703179359436, "eval_rmse": 0.15939703583717346, "eval_runtime": 65.2215, "eval_samples_per_second": 36.092, "eval_steps_per_second": 2.269, "learning_rate": 1e-05, "step": 17520 }, { "epoch": 41.0, "eval_explained_variance": 0.4507579803466797, "eval_kl_divergence": 0.3335873782634735, "eval_loss": 0.4677547216415405, "eval_mae": 0.12059084326028824, "eval_rmse": 0.159872904419899, "eval_runtime": 65.9882, "eval_samples_per_second": 35.673, "eval_steps_per_second": 2.243, "learning_rate": 1e-05, "step": 17958 }, { "epoch": 41.0958904109589, "grad_norm": 0.1584874391555786, "learning_rate": 1e-05, "loss": 0.4608, "step": 18000 }, { "epoch": 42.0, "eval_explained_variance": 0.4486294686794281, "eval_kl_divergence": 0.5311685800552368, "eval_loss": 0.4671882390975952, "eval_mae": 0.1177595853805542, "eval_rmse": 0.15967120230197906, "eval_runtime": 65.4501, "eval_samples_per_second": 35.966, "eval_steps_per_second": 2.261, "learning_rate": 1e-05, "step": 18396 }, { "epoch": 42.23744292237443, "grad_norm": 0.17140232026576996, "learning_rate": 1e-05, "loss": 0.4615, "step": 18500 }, { "epoch": 43.0, "eval_explained_variance": 0.45157137513160706, "eval_kl_divergence": 0.3923657536506653, "eval_loss": 0.46716412901878357, "eval_mae": 0.1185157299041748, "eval_rmse": 0.1592295914888382, "eval_runtime": 64.3671, "eval_samples_per_second": 36.571, "eval_steps_per_second": 2.299, "learning_rate": 1e-05, "step": 18834 }, { "epoch": 43.37899543378995, "grad_norm": 0.12803754210472107, "learning_rate": 1e-05, "loss": 0.4601, "step": 19000 }, { "epoch": 44.0, "eval_explained_variance": 0.44912728667259216, "eval_kl_divergence": 0.4258858561515808, "eval_loss": 0.4678168296813965, "eval_mae": 0.11944716423749924, "eval_rmse": 0.16020986437797546, "eval_runtime": 65.6519, "eval_samples_per_second": 35.856, "eval_steps_per_second": 2.254, "learning_rate": 1e-05, "step": 19272 }, { "epoch": 44.52054794520548, "grad_norm": 0.12536858022212982, "learning_rate": 1.0000000000000002e-06, "loss": 0.4602, "step": 19500 }, { "epoch": 45.0, "eval_explained_variance": 0.4489940404891968, "eval_kl_divergence": 0.5214298367500305, "eval_loss": 0.46699702739715576, "eval_mae": 0.11719372868537903, "eval_rmse": 0.15936775505542755, "eval_runtime": 64.8181, "eval_samples_per_second": 36.317, "eval_steps_per_second": 2.283, "learning_rate": 1.0000000000000002e-06, "step": 19710 }, { "epoch": 45.662100456621005, "grad_norm": 0.12503549456596375, "learning_rate": 1.0000000000000002e-06, "loss": 0.4616, "step": 20000 }, { "epoch": 46.0, "eval_explained_variance": 0.45176592469215393, "eval_kl_divergence": 0.4174787700176239, "eval_loss": 0.46712958812713623, "eval_mae": 0.11880326271057129, "eval_rmse": 0.1593877524137497, "eval_runtime": 64.134, "eval_samples_per_second": 36.704, "eval_steps_per_second": 2.308, "learning_rate": 1.0000000000000002e-06, "step": 20148 }, { "epoch": 46.80365296803653, "grad_norm": 0.1746779829263687, "learning_rate": 1.0000000000000002e-06, "loss": 0.4602, "step": 20500 }, { "epoch": 47.0, "eval_explained_variance": 0.4524901807308197, "eval_kl_divergence": 0.4446321427822113, "eval_loss": 0.4666382074356079, "eval_mae": 0.11884639412164688, "eval_rmse": 0.15886224806308746, "eval_runtime": 68.911, "eval_samples_per_second": 34.16, "eval_steps_per_second": 2.148, "learning_rate": 1.0000000000000002e-06, "step": 20586 }, { "epoch": 47.945205479452056, "grad_norm": 0.18253998458385468, "learning_rate": 1.0000000000000002e-06, "loss": 0.4604, "step": 21000 }, { "epoch": 48.0, "eval_explained_variance": 0.44860827922821045, "eval_kl_divergence": 0.5755118727684021, "eval_loss": 0.46714723110198975, "eval_mae": 0.11802936345338821, "eval_rmse": 0.15972274541854858, "eval_runtime": 68.5695, "eval_samples_per_second": 34.33, "eval_steps_per_second": 2.158, "learning_rate": 1.0000000000000002e-06, "step": 21024 }, { "epoch": 49.0, "eval_explained_variance": 0.4494647979736328, "eval_kl_divergence": 0.4303589463233948, "eval_loss": 0.46758702397346497, "eval_mae": 0.11922705173492432, "eval_rmse": 0.15995512902736664, "eval_runtime": 68.4997, "eval_samples_per_second": 34.365, "eval_steps_per_second": 2.161, "learning_rate": 1.0000000000000002e-06, "step": 21462 }, { "epoch": 49.08675799086758, "grad_norm": 0.1836538463830948, "learning_rate": 1.0000000000000002e-06, "loss": 0.4606, "step": 21500 }, { "epoch": 50.0, "eval_explained_variance": 0.4534037411212921, "eval_kl_divergence": 0.33374354243278503, "eval_loss": 0.46752068400382996, "eval_mae": 0.12040751427412033, "eval_rmse": 0.15945331752300262, "eval_runtime": 67.7842, "eval_samples_per_second": 34.728, "eval_steps_per_second": 2.183, "learning_rate": 1.0000000000000002e-06, "step": 21900 }, { "epoch": 50.22831050228311, "grad_norm": 0.18452928960323334, "learning_rate": 1.0000000000000002e-06, "loss": 0.4598, "step": 22000 }, { "epoch": 51.0, "eval_explained_variance": 0.4523892402648926, "eval_kl_divergence": 0.395465224981308, "eval_loss": 0.46691644191741943, "eval_mae": 0.11809410899877548, "eval_rmse": 0.1590944528579712, "eval_runtime": 68.2629, "eval_samples_per_second": 34.484, "eval_steps_per_second": 2.168, "learning_rate": 1.0000000000000002e-06, "step": 22338 }, { "epoch": 51.36986301369863, "grad_norm": 0.1816985160112381, "learning_rate": 1.0000000000000002e-06, "loss": 0.4602, "step": 22500 }, { "epoch": 52.0, "eval_explained_variance": 0.45300889015197754, "eval_kl_divergence": 0.4761090576648712, "eval_loss": 0.466439425945282, "eval_mae": 0.1174706444144249, "eval_rmse": 0.15875311195850372, "eval_runtime": 68.2396, "eval_samples_per_second": 34.496, "eval_steps_per_second": 2.169, "learning_rate": 1.0000000000000002e-06, "step": 22776 }, { "epoch": 52.51141552511415, "grad_norm": 0.17806819081306458, "learning_rate": 1.0000000000000002e-06, "loss": 0.462, "step": 23000 }, { "epoch": 53.0, "eval_explained_variance": 0.45259252190589905, "eval_kl_divergence": 0.43274176120758057, "eval_loss": 0.4667709469795227, "eval_mae": 0.11889918893575668, "eval_rmse": 0.15901200473308563, "eval_runtime": 66.8799, "eval_samples_per_second": 35.197, "eval_steps_per_second": 2.213, "learning_rate": 1.0000000000000002e-06, "step": 23214 }, { "epoch": 53.65296803652968, "grad_norm": 0.18054644763469696, "learning_rate": 1.0000000000000002e-06, "loss": 0.4604, "step": 23500 }, { "epoch": 54.0, "eval_explained_variance": 0.4532507658004761, "eval_kl_divergence": 0.3724806606769562, "eval_loss": 0.46701404452323914, "eval_mae": 0.11868719011545181, "eval_rmse": 0.15923155844211578, "eval_runtime": 73.556, "eval_samples_per_second": 32.003, "eval_steps_per_second": 2.012, "learning_rate": 1.0000000000000002e-06, "step": 23652 }, { "epoch": 54.794520547945204, "grad_norm": 0.26471829414367676, "learning_rate": 1.0000000000000002e-06, "loss": 0.461, "step": 24000 }, { "epoch": 55.0, "eval_explained_variance": 0.45088374614715576, "eval_kl_divergence": 0.38409897685050964, "eval_loss": 0.467383474111557, "eval_mae": 0.11990005522966385, "eval_rmse": 0.1595049947500229, "eval_runtime": 70.451, "eval_samples_per_second": 33.413, "eval_steps_per_second": 2.101, "learning_rate": 1.0000000000000002e-06, "step": 24090 }, { "epoch": 55.93607305936073, "grad_norm": 0.2783886194229126, "learning_rate": 1.0000000000000002e-06, "loss": 0.4599, "step": 24500 }, { "epoch": 56.0, "eval_explained_variance": 0.45115411281585693, "eval_kl_divergence": 0.3821828067302704, "eval_loss": 0.46739572286605835, "eval_mae": 0.11897724121809006, "eval_rmse": 0.15964223444461823, "eval_runtime": 69.6578, "eval_samples_per_second": 33.794, "eval_steps_per_second": 2.125, "learning_rate": 1.0000000000000002e-06, "step": 24528 }, { "epoch": 57.0, "eval_explained_variance": 0.4505263864994049, "eval_kl_divergence": 0.4674541652202606, "eval_loss": 0.46702033281326294, "eval_mae": 0.1185864806175232, "eval_rmse": 0.15932416915893555, "eval_runtime": 67.4689, "eval_samples_per_second": 34.89, "eval_steps_per_second": 2.194, "learning_rate": 1.0000000000000002e-06, "step": 24966 }, { "epoch": 57.077625570776256, "grad_norm": 0.16562320291996002, "learning_rate": 1.0000000000000002e-06, "loss": 0.4594, "step": 25000 }, { "epoch": 58.0, "eval_explained_variance": 0.4521506726741791, "eval_kl_divergence": 0.37376847863197327, "eval_loss": 0.46735846996307373, "eval_mae": 0.11891353130340576, "eval_rmse": 0.15956538915634155, "eval_runtime": 68.6492, "eval_samples_per_second": 34.29, "eval_steps_per_second": 2.156, "learning_rate": 1.0000000000000002e-06, "step": 25404 }, { "epoch": 58.21917808219178, "grad_norm": 0.21171259880065918, "learning_rate": 1.0000000000000002e-07, "loss": 0.4613, "step": 25500 }, { "epoch": 59.0, "eval_explained_variance": 0.45357391238212585, "eval_kl_divergence": 0.4204346239566803, "eval_loss": 0.46666717529296875, "eval_mae": 0.11845538765192032, "eval_rmse": 0.1589372605085373, "eval_runtime": 69.2012, "eval_samples_per_second": 34.017, "eval_steps_per_second": 2.139, "learning_rate": 1.0000000000000002e-07, "step": 25842 }, { "epoch": 59.36073059360731, "grad_norm": 0.1960112601518631, "learning_rate": 1.0000000000000002e-07, "loss": 0.4607, "step": 26000 }, { "epoch": 60.0, "eval_explained_variance": 0.4513193368911743, "eval_kl_divergence": 0.45320600271224976, "eval_loss": 0.46685320138931274, "eval_mae": 0.11779770255088806, "eval_rmse": 0.15917657315731049, "eval_runtime": 71.4331, "eval_samples_per_second": 32.954, "eval_steps_per_second": 2.072, "learning_rate": 1.0000000000000002e-07, "step": 26280 }, { "epoch": 60.50228310502283, "grad_norm": 0.2178792506456375, "learning_rate": 1.0000000000000002e-07, "loss": 0.4613, "step": 26500 }, { "epoch": 61.0, "eval_explained_variance": 0.45110437273979187, "eval_kl_divergence": 0.40322577953338623, "eval_loss": 0.46734780073165894, "eval_mae": 0.11893540620803833, "eval_rmse": 0.1595635712146759, "eval_runtime": 69.3534, "eval_samples_per_second": 33.942, "eval_steps_per_second": 2.134, "learning_rate": 1.0000000000000002e-07, "step": 26718 }, { "epoch": 61.64383561643836, "grad_norm": 0.16740958392620087, "learning_rate": 1.0000000000000002e-07, "loss": 0.4598, "step": 27000 }, { "epoch": 62.0, "eval_explained_variance": 0.4526772201061249, "eval_kl_divergence": 0.3406714200973511, "eval_loss": 0.4673011302947998, "eval_mae": 0.11888447403907776, "eval_rmse": 0.1594574898481369, "eval_runtime": 70.4024, "eval_samples_per_second": 33.436, "eval_steps_per_second": 2.102, "learning_rate": 1.0000000000000002e-07, "step": 27156 }, { "epoch": 62.0, "learning_rate": 1.0000000000000002e-07, "step": 27156, "total_flos": 6.42634409963284e+19, "train_loss": 0.466335079458891, "train_runtime": 17194.6751, "train_samples_per_second": 61.092, "train_steps_per_second": 3.821 } ], "logging_steps": 500, "max_steps": 65700, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.42634409963284e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }