diff --git "a/checkpoint-6139/trainer_state.json" "b/checkpoint-6139/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6139/trainer_state.json" @@ -0,0 +1,4428 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999959278413487, + "eval_steps": 500, + "global_step": 6139, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001628863460520422, + "eval_loss": 2.4029905796051025, + "eval_runtime": 101.0475, + "eval_samples_per_second": 25.582, + "eval_steps_per_second": 25.582, + "step": 1 + }, + { + "epoch": 0.0016288634605204219, + "grad_norm": 24.467580795288086, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7479, + "step": 10 + }, + { + "epoch": 0.0032577269210408437, + "grad_norm": 25.53505516052246, + "learning_rate": 4.000000000000001e-06, + "loss": 1.7216, + "step": 20 + }, + { + "epoch": 0.004886590381561266, + "grad_norm": 18.23355484008789, + "learning_rate": 6e-06, + "loss": 1.5159, + "step": 30 + }, + { + "epoch": 0.006515453842081687, + "grad_norm": 12.464448928833008, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4322, + "step": 40 + }, + { + "epoch": 0.00814431730260211, + "grad_norm": 10.985918998718262, + "learning_rate": 1e-05, + "loss": 1.3426, + "step": 50 + }, + { + "epoch": 0.009773180763122532, + "grad_norm": 9.040105819702148, + "learning_rate": 1.2e-05, + "loss": 1.314, + "step": 60 + }, + { + "epoch": 0.011402044223642953, + "grad_norm": 10.273420333862305, + "learning_rate": 1.4e-05, + "loss": 1.2463, + "step": 70 + }, + { + "epoch": 0.013030907684163375, + "grad_norm": 11.754436492919922, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.2561, + "step": 80 + }, + { + "epoch": 0.014659771144683796, + "grad_norm": 12.387304306030273, + "learning_rate": 1.8e-05, + "loss": 1.275, + "step": 90 + }, + { + "epoch": 0.01628863460520422, + "grad_norm": 9.28602409362793, + "learning_rate": 2e-05, + "loss": 1.3315, + "step": 100 + }, + { + "epoch": 0.01791749806572464, + "grad_norm": 9.231125831604004, + "learning_rate": 1.9999864687252914e-05, + "loss": 1.3106, + "step": 110 + }, + { + "epoch": 0.019546361526245063, + "grad_norm": 19.837526321411133, + "learning_rate": 1.9999458752673554e-05, + "loss": 1.342, + "step": 120 + }, + { + "epoch": 0.021175224986765485, + "grad_norm": 11.052694320678711, + "learning_rate": 1.9998782207247556e-05, + "loss": 1.2678, + "step": 130 + }, + { + "epoch": 0.022804088447285906, + "grad_norm": 14.448543548583984, + "learning_rate": 1.9997835069283954e-05, + "loss": 1.262, + "step": 140 + }, + { + "epoch": 0.024432951907806328, + "grad_norm": 8.338994979858398, + "learning_rate": 1.999661736441472e-05, + "loss": 1.2932, + "step": 150 + }, + { + "epoch": 0.02606181536832675, + "grad_norm": 10.267782211303711, + "learning_rate": 1.9995129125594058e-05, + "loss": 1.2789, + "step": 160 + }, + { + "epoch": 0.02769067882884717, + "grad_norm": 17.692113876342773, + "learning_rate": 1.999337039309749e-05, + "loss": 1.2546, + "step": 170 + }, + { + "epoch": 0.029319542289367593, + "grad_norm": 11.050065040588379, + "learning_rate": 1.9991341214520816e-05, + "loss": 1.2649, + "step": 180 + }, + { + "epoch": 0.030948405749888015, + "grad_norm": 11.198172569274902, + "learning_rate": 1.9989041644778773e-05, + "loss": 1.2018, + "step": 190 + }, + { + "epoch": 0.03257726921040844, + "grad_norm": 8.730816841125488, + "learning_rate": 1.9986471746103584e-05, + "loss": 1.3364, + "step": 200 + }, + { + 
"epoch": 0.03420613267092886, + "grad_norm": 12.193893432617188, + "learning_rate": 1.998363158804326e-05, + "loss": 1.3358, + "step": 210 + }, + { + "epoch": 0.03583499613144928, + "grad_norm": 12.82657241821289, + "learning_rate": 1.9980521247459714e-05, + "loss": 1.3936, + "step": 220 + }, + { + "epoch": 0.037463859591969705, + "grad_norm": 10.334324836730957, + "learning_rate": 1.9977140808526696e-05, + "loss": 1.2811, + "step": 230 + }, + { + "epoch": 0.039092723052490126, + "grad_norm": 10.822787284851074, + "learning_rate": 1.99734903627275e-05, + "loss": 1.3964, + "step": 240 + }, + { + "epoch": 0.04072158651301055, + "grad_norm": 10.038861274719238, + "learning_rate": 1.9969570008852498e-05, + "loss": 1.3229, + "step": 250 + }, + { + "epoch": 0.04235044997353097, + "grad_norm": 10.05347728729248, + "learning_rate": 1.9965379852996455e-05, + "loss": 1.2529, + "step": 260 + }, + { + "epoch": 0.04397931343405139, + "grad_norm": 15.111456871032715, + "learning_rate": 1.996092000855568e-05, + "loss": 1.3372, + "step": 270 + }, + { + "epoch": 0.04560817689457181, + "grad_norm": 8.95826244354248, + "learning_rate": 1.9956190596224923e-05, + "loss": 1.3154, + "step": 280 + }, + { + "epoch": 0.047237040355092234, + "grad_norm": 8.862021446228027, + "learning_rate": 1.9951191743994146e-05, + "loss": 1.2578, + "step": 290 + }, + { + "epoch": 0.048865903815612656, + "grad_norm": 12.974872589111328, + "learning_rate": 1.9945923587145032e-05, + "loss": 1.2848, + "step": 300 + }, + { + "epoch": 0.05049476727613308, + "grad_norm": 8.246659278869629, + "learning_rate": 1.994038626824734e-05, + "loss": 1.2349, + "step": 310 + }, + { + "epoch": 0.0521236307366535, + "grad_norm": 10.3037748336792, + "learning_rate": 1.993457993715503e-05, + "loss": 1.2731, + "step": 320 + }, + { + "epoch": 0.05375249419717392, + "grad_norm": 9.140987396240234, + "learning_rate": 1.992850475100223e-05, + "loss": 1.2673, + "step": 330 + }, + { + "epoch": 0.05538135765769434, + "grad_norm": 9.048047065734863, + "learning_rate": 1.992216087419896e-05, + "loss": 1.2842, + "step": 340 + }, + { + "epoch": 0.057010221118214764, + "grad_norm": 8.832723617553711, + "learning_rate": 1.99155484784267e-05, + "loss": 1.227, + "step": 350 + }, + { + "epoch": 0.058639084578735186, + "grad_norm": 8.928304672241211, + "learning_rate": 1.9908667742633742e-05, + "loss": 1.3681, + "step": 360 + }, + { + "epoch": 0.06026794803925561, + "grad_norm": 9.978281021118164, + "learning_rate": 1.990151885303034e-05, + "loss": 1.1123, + "step": 370 + }, + { + "epoch": 0.06189681149977603, + "grad_norm": 8.430088996887207, + "learning_rate": 1.989410200308366e-05, + "loss": 1.263, + "step": 380 + }, + { + "epoch": 0.06352567496029646, + "grad_norm": 11.750625610351562, + "learning_rate": 1.9886417393512584e-05, + "loss": 1.255, + "step": 390 + }, + { + "epoch": 0.06515453842081688, + "grad_norm": 10.856910705566406, + "learning_rate": 1.987846523228223e-05, + "loss": 1.2889, + "step": 400 + }, + { + "epoch": 0.0667834018813373, + "grad_norm": 10.42456340789795, + "learning_rate": 1.9870245734598358e-05, + "loss": 1.3936, + "step": 410 + }, + { + "epoch": 0.06841226534185772, + "grad_norm": 9.490607261657715, + "learning_rate": 1.986175912290153e-05, + "loss": 1.3335, + "step": 420 + }, + { + "epoch": 0.07004112880237814, + "grad_norm": 8.353849411010742, + "learning_rate": 1.985300562686109e-05, + "loss": 1.2302, + "step": 430 + }, + { + "epoch": 0.07166999226289857, + "grad_norm": 13.212517738342285, + "learning_rate": 1.9843985483368968e-05, + 
"loss": 1.38, + "step": 440 + }, + { + "epoch": 0.07329885572341899, + "grad_norm": 6.969715118408203, + "learning_rate": 1.9834698936533226e-05, + "loss": 1.2775, + "step": 450 + }, + { + "epoch": 0.07492771918393941, + "grad_norm": 7.460076332092285, + "learning_rate": 1.9825146237671513e-05, + "loss": 1.4045, + "step": 460 + }, + { + "epoch": 0.07655658264445983, + "grad_norm": 8.584343910217285, + "learning_rate": 1.9815327645304204e-05, + "loss": 1.2884, + "step": 470 + }, + { + "epoch": 0.07818544610498025, + "grad_norm": 8.771677017211914, + "learning_rate": 1.9805243425147448e-05, + "loss": 1.3839, + "step": 480 + }, + { + "epoch": 0.07981430956550067, + "grad_norm": 10.828780174255371, + "learning_rate": 1.9794893850105942e-05, + "loss": 1.3274, + "step": 490 + }, + { + "epoch": 0.0814431730260211, + "grad_norm": 8.709300994873047, + "learning_rate": 1.978427920026558e-05, + "loss": 1.2572, + "step": 500 + }, + { + "epoch": 0.0814431730260211, + "eval_loss": 1.1935244798660278, + "eval_runtime": 101.9438, + "eval_samples_per_second": 25.357, + "eval_steps_per_second": 25.357, + "step": 500 + }, + { + "epoch": 0.08307203648654152, + "grad_norm": 9.961617469787598, + "learning_rate": 1.977339976288584e-05, + "loss": 1.382, + "step": 510 + }, + { + "epoch": 0.08470089994706194, + "grad_norm": 9.85478401184082, + "learning_rate": 1.9762255832392046e-05, + "loss": 1.3768, + "step": 520 + }, + { + "epoch": 0.08632976340758236, + "grad_norm": 9.125312805175781, + "learning_rate": 1.975084771036736e-05, + "loss": 1.2088, + "step": 530 + }, + { + "epoch": 0.08795862686810278, + "grad_norm": 7.915559768676758, + "learning_rate": 1.973917570554464e-05, + "loss": 1.2951, + "step": 540 + }, + { + "epoch": 0.0895874903286232, + "grad_norm": 9.664942741394043, + "learning_rate": 1.9727240133798106e-05, + "loss": 1.3305, + "step": 550 + }, + { + "epoch": 0.09121635378914363, + "grad_norm": 10.48803997039795, + "learning_rate": 1.9715041318134756e-05, + "loss": 1.4256, + "step": 560 + }, + { + "epoch": 0.09284521724966405, + "grad_norm": 10.634197235107422, + "learning_rate": 1.9702579588685634e-05, + "loss": 1.2912, + "step": 570 + }, + { + "epoch": 0.09447408071018447, + "grad_norm": 9.283407211303711, + "learning_rate": 1.968985528269692e-05, + "loss": 1.3337, + "step": 580 + }, + { + "epoch": 0.09610294417070489, + "grad_norm": 9.688226699829102, + "learning_rate": 1.9676868744520768e-05, + "loss": 1.3231, + "step": 590 + }, + { + "epoch": 0.09773180763122531, + "grad_norm": 10.292962074279785, + "learning_rate": 1.9663620325606005e-05, + "loss": 1.2227, + "step": 600 + }, + { + "epoch": 0.09936067109174573, + "grad_norm": 5.677855014801025, + "learning_rate": 1.965011038448863e-05, + "loss": 1.4159, + "step": 610 + }, + { + "epoch": 0.10098953455226616, + "grad_norm": 7.153832912445068, + "learning_rate": 1.963633928678209e-05, + "loss": 1.3664, + "step": 620 + }, + { + "epoch": 0.10261839801278658, + "grad_norm": 12.980141639709473, + "learning_rate": 1.9622307405167395e-05, + "loss": 1.2955, + "step": 630 + }, + { + "epoch": 0.104247261473307, + "grad_norm": 10.464163780212402, + "learning_rate": 1.9608015119383036e-05, + "loss": 1.2868, + "step": 640 + }, + { + "epoch": 0.10587612493382742, + "grad_norm": 11.611323356628418, + "learning_rate": 1.9593462816214698e-05, + "loss": 1.2963, + "step": 650 + }, + { + "epoch": 0.10750498839434784, + "grad_norm": 6.969916820526123, + "learning_rate": 1.9578650889484815e-05, + "loss": 1.2765, + "step": 660 + }, + { + "epoch": 0.10913385185486826, + 
"grad_norm": 10.346363067626953, + "learning_rate": 1.9563579740041884e-05, + "loss": 1.3072, + "step": 670 + }, + { + "epoch": 0.11076271531538869, + "grad_norm": 10.077250480651855, + "learning_rate": 1.9548249775749623e-05, + "loss": 1.3099, + "step": 680 + }, + { + "epoch": 0.1123915787759091, + "grad_norm": 10.486137390136719, + "learning_rate": 1.9532661411475955e-05, + "loss": 1.3504, + "step": 690 + }, + { + "epoch": 0.11402044223642953, + "grad_norm": 6.885483264923096, + "learning_rate": 1.9516815069081758e-05, + "loss": 1.2268, + "step": 700 + }, + { + "epoch": 0.11564930569694995, + "grad_norm": 7.814432144165039, + "learning_rate": 1.9500711177409456e-05, + "loss": 1.394, + "step": 710 + }, + { + "epoch": 0.11727816915747037, + "grad_norm": 13.40298080444336, + "learning_rate": 1.948435017227141e-05, + "loss": 1.2634, + "step": 720 + }, + { + "epoch": 0.1189070326179908, + "grad_norm": 8.984935760498047, + "learning_rate": 1.9467732496438137e-05, + "loss": 1.2645, + "step": 730 + }, + { + "epoch": 0.12053589607851121, + "grad_norm": 8.895575523376465, + "learning_rate": 1.9450858599626304e-05, + "loss": 1.2724, + "step": 740 + }, + { + "epoch": 0.12216475953903164, + "grad_norm": 9.734760284423828, + "learning_rate": 1.9433728938486576e-05, + "loss": 1.3362, + "step": 750 + }, + { + "epoch": 0.12379362299955206, + "grad_norm": 11.392585754394531, + "learning_rate": 1.941634397659126e-05, + "loss": 1.2574, + "step": 760 + }, + { + "epoch": 0.12542248646007248, + "grad_norm": 7.931302070617676, + "learning_rate": 1.9398704184421745e-05, + "loss": 1.3003, + "step": 770 + }, + { + "epoch": 0.12705134992059292, + "grad_norm": 7.866138458251953, + "learning_rate": 1.9380810039355776e-05, + "loss": 1.2672, + "step": 780 + }, + { + "epoch": 0.12868021338111332, + "grad_norm": 12.06697940826416, + "learning_rate": 1.936266202565454e-05, + "loss": 1.4121, + "step": 790 + }, + { + "epoch": 0.13030907684163376, + "grad_norm": 10.089153289794922, + "learning_rate": 1.9344260634449556e-05, + "loss": 1.3314, + "step": 800 + }, + { + "epoch": 0.13193794030215417, + "grad_norm": 7.83293342590332, + "learning_rate": 1.9325606363729378e-05, + "loss": 1.332, + "step": 810 + }, + { + "epoch": 0.1335668037626746, + "grad_norm": 18.291975021362305, + "learning_rate": 1.930669971832613e-05, + "loss": 1.2154, + "step": 820 + }, + { + "epoch": 0.135195667223195, + "grad_norm": 9.173296928405762, + "learning_rate": 1.9287541209901842e-05, + "loss": 1.3371, + "step": 830 + }, + { + "epoch": 0.13682453068371545, + "grad_norm": 7.5841875076293945, + "learning_rate": 1.9268131356934592e-05, + "loss": 1.3549, + "step": 840 + }, + { + "epoch": 0.13845339414423585, + "grad_norm": 9.693364143371582, + "learning_rate": 1.924847068470449e-05, + "loss": 1.2978, + "step": 850 + }, + { + "epoch": 0.1400822576047563, + "grad_norm": 9.127781867980957, + "learning_rate": 1.9228559725279444e-05, + "loss": 1.261, + "step": 860 + }, + { + "epoch": 0.1417111210652767, + "grad_norm": 10.408056259155273, + "learning_rate": 1.9208399017500773e-05, + "loss": 1.2478, + "step": 870 + }, + { + "epoch": 0.14333998452579713, + "grad_norm": 9.875020980834961, + "learning_rate": 1.918798910696864e-05, + "loss": 1.3205, + "step": 880 + }, + { + "epoch": 0.14496884798631754, + "grad_norm": 8.818717956542969, + "learning_rate": 1.916733054602725e-05, + "loss": 1.389, + "step": 890 + }, + { + "epoch": 0.14659771144683797, + "grad_norm": 13.227215766906738, + "learning_rate": 1.9146423893749924e-05, + "loss": 1.3243, + "step": 900 + }, + 
{ + "epoch": 0.14822657490735838, + "grad_norm": 7.599000930786133, + "learning_rate": 1.9125269715923983e-05, + "loss": 1.1711, + "step": 910 + }, + { + "epoch": 0.14985543836787882, + "grad_norm": 9.84504508972168, + "learning_rate": 1.910386858503541e-05, + "loss": 1.2145, + "step": 920 + }, + { + "epoch": 0.15148430182839923, + "grad_norm": 7.403562068939209, + "learning_rate": 1.908222108025336e-05, + "loss": 1.2809, + "step": 930 + }, + { + "epoch": 0.15311316528891966, + "grad_norm": 12.983716011047363, + "learning_rate": 1.9060327787414498e-05, + "loss": 1.2949, + "step": 940 + }, + { + "epoch": 0.15474202874944007, + "grad_norm": 8.185487747192383, + "learning_rate": 1.9038189299007154e-05, + "loss": 1.3662, + "step": 950 + }, + { + "epoch": 0.1563708922099605, + "grad_norm": 9.041784286499023, + "learning_rate": 1.901580621415526e-05, + "loss": 1.2835, + "step": 960 + }, + { + "epoch": 0.1579997556704809, + "grad_norm": 7.92235803604126, + "learning_rate": 1.899317913860215e-05, + "loss": 1.3625, + "step": 970 + }, + { + "epoch": 0.15962861913100135, + "grad_norm": 15.111337661743164, + "learning_rate": 1.8970308684694186e-05, + "loss": 1.2816, + "step": 980 + }, + { + "epoch": 0.16125748259152176, + "grad_norm": 10.149383544921875, + "learning_rate": 1.894719547136415e-05, + "loss": 1.2481, + "step": 990 + }, + { + "epoch": 0.1628863460520422, + "grad_norm": 10.364059448242188, + "learning_rate": 1.8923840124114517e-05, + "loss": 1.3061, + "step": 1000 + }, + { + "epoch": 0.1628863460520422, + "eval_loss": 1.1865288019180298, + "eval_runtime": 101.3159, + "eval_samples_per_second": 25.514, + "eval_steps_per_second": 25.514, + "step": 1000 + }, + { + "epoch": 0.1645152095125626, + "grad_norm": 8.624238014221191, + "learning_rate": 1.8900243275000532e-05, + "loss": 1.2933, + "step": 1010 + }, + { + "epoch": 0.16614407297308303, + "grad_norm": 9.401992797851562, + "learning_rate": 1.8876405562613088e-05, + "loss": 1.244, + "step": 1020 + }, + { + "epoch": 0.16777293643360344, + "grad_norm": 10.441336631774902, + "learning_rate": 1.8852327632061457e-05, + "loss": 1.277, + "step": 1030 + }, + { + "epoch": 0.16940179989412388, + "grad_norm": 11.799003601074219, + "learning_rate": 1.8828010134955822e-05, + "loss": 1.33, + "step": 1040 + }, + { + "epoch": 0.17103066335464429, + "grad_norm": 9.582467079162598, + "learning_rate": 1.8803453729389648e-05, + "loss": 1.4002, + "step": 1050 + }, + { + "epoch": 0.17265952681516472, + "grad_norm": 8.40396499633789, + "learning_rate": 1.8778659079921877e-05, + "loss": 1.3458, + "step": 1060 + }, + { + "epoch": 0.17428839027568513, + "grad_norm": 10.204269409179688, + "learning_rate": 1.8753626857558935e-05, + "loss": 1.3259, + "step": 1070 + }, + { + "epoch": 0.17591725373620556, + "grad_norm": 6.9873948097229, + "learning_rate": 1.8728357739736578e-05, + "loss": 1.3199, + "step": 1080 + }, + { + "epoch": 0.17754611719672597, + "grad_norm": 9.196039199829102, + "learning_rate": 1.8702852410301556e-05, + "loss": 1.337, + "step": 1090 + }, + { + "epoch": 0.1791749806572464, + "grad_norm": 8.873251914978027, + "learning_rate": 1.86771115594931e-05, + "loss": 1.2909, + "step": 1100 + }, + { + "epoch": 0.18080384411776682, + "grad_norm": 9.222848892211914, + "learning_rate": 1.865113588392427e-05, + "loss": 1.2325, + "step": 1110 + }, + { + "epoch": 0.18243270757828725, + "grad_norm": 7.872706413269043, + "learning_rate": 1.8624926086563057e-05, + "loss": 1.2469, + "step": 1120 + }, + { + "epoch": 0.18406157103880766, + "grad_norm": 9.13894271850586, 
+ "learning_rate": 1.85984828767134e-05, + "loss": 1.2889, + "step": 1130 + }, + { + "epoch": 0.1856904344993281, + "grad_norm": 7.499070167541504, + "learning_rate": 1.8571806969995982e-05, + "loss": 1.3696, + "step": 1140 + }, + { + "epoch": 0.1873192979598485, + "grad_norm": 10.489768028259277, + "learning_rate": 1.854489908832884e-05, + "loss": 1.2415, + "step": 1150 + }, + { + "epoch": 0.18894816142036894, + "grad_norm": 9.07557201385498, + "learning_rate": 1.8517759959907845e-05, + "loss": 1.307, + "step": 1160 + }, + { + "epoch": 0.19057702488088937, + "grad_norm": 8.078954696655273, + "learning_rate": 1.849039031918701e-05, + "loss": 1.2854, + "step": 1170 + }, + { + "epoch": 0.19220588834140978, + "grad_norm": 8.329951286315918, + "learning_rate": 1.846279090685859e-05, + "loss": 1.2446, + "step": 1180 + }, + { + "epoch": 0.19383475180193022, + "grad_norm": 9.939054489135742, + "learning_rate": 1.8434962469833036e-05, + "loss": 1.4163, + "step": 1190 + }, + { + "epoch": 0.19546361526245062, + "grad_norm": 9.757793426513672, + "learning_rate": 1.8406905761218815e-05, + "loss": 1.1905, + "step": 1200 + }, + { + "epoch": 0.19709247872297106, + "grad_norm": 7.9107489585876465, + "learning_rate": 1.8378621540301976e-05, + "loss": 1.2955, + "step": 1210 + }, + { + "epoch": 0.19872134218349147, + "grad_norm": 9.854551315307617, + "learning_rate": 1.835011057252565e-05, + "loss": 1.2293, + "step": 1220 + }, + { + "epoch": 0.2003502056440119, + "grad_norm": 10.388916969299316, + "learning_rate": 1.8321373629469313e-05, + "loss": 1.337, + "step": 1230 + }, + { + "epoch": 0.2019790691045323, + "grad_norm": 9.081336975097656, + "learning_rate": 1.8292411488827906e-05, + "loss": 1.2708, + "step": 1240 + }, + { + "epoch": 0.20360793256505275, + "grad_norm": 10.85607624053955, + "learning_rate": 1.826322493439079e-05, + "loss": 1.2249, + "step": 1250 + }, + { + "epoch": 0.20523679602557315, + "grad_norm": 10.35145378112793, + "learning_rate": 1.823381475602054e-05, + "loss": 1.3433, + "step": 1260 + }, + { + "epoch": 0.2068656594860936, + "grad_norm": 12.942462921142578, + "learning_rate": 1.8204181749631557e-05, + "loss": 1.3062, + "step": 1270 + }, + { + "epoch": 0.208494522946614, + "grad_norm": 8.48989200592041, + "learning_rate": 1.8174326717168547e-05, + "loss": 1.2474, + "step": 1280 + }, + { + "epoch": 0.21012338640713443, + "grad_norm": 7.585806846618652, + "learning_rate": 1.8144250466584794e-05, + "loss": 1.3294, + "step": 1290 + }, + { + "epoch": 0.21175224986765484, + "grad_norm": 9.783578872680664, + "learning_rate": 1.8113953811820322e-05, + "loss": 1.308, + "step": 1300 + }, + { + "epoch": 0.21338111332817528, + "grad_norm": 8.406821250915527, + "learning_rate": 1.8083437572779842e-05, + "loss": 1.2757, + "step": 1310 + }, + { + "epoch": 0.21500997678869568, + "grad_norm": 11.778732299804688, + "learning_rate": 1.8052702575310588e-05, + "loss": 1.2255, + "step": 1320 + }, + { + "epoch": 0.21663884024921612, + "grad_norm": 8.81125259399414, + "learning_rate": 1.802174965117994e-05, + "loss": 1.2558, + "step": 1330 + }, + { + "epoch": 0.21826770370973653, + "grad_norm": 8.534640312194824, + "learning_rate": 1.7990579638052944e-05, + "loss": 1.3359, + "step": 1340 + }, + { + "epoch": 0.21989656717025696, + "grad_norm": 7.685283184051514, + "learning_rate": 1.795919337946962e-05, + "loss": 1.2347, + "step": 1350 + }, + { + "epoch": 0.22152543063077737, + "grad_norm": 9.883221626281738, + "learning_rate": 1.7927591724822132e-05, + "loss": 1.2605, + "step": 1360 + }, + { + "epoch": 
0.2231542940912978, + "grad_norm": 6.772229194641113, + "learning_rate": 1.7895775529331835e-05, + "loss": 1.2475, + "step": 1370 + }, + { + "epoch": 0.2247831575518182, + "grad_norm": 6.82761287689209, + "learning_rate": 1.7863745654026078e-05, + "loss": 1.2686, + "step": 1380 + }, + { + "epoch": 0.22641202101233865, + "grad_norm": 13.5228853225708, + "learning_rate": 1.7831502965714958e-05, + "loss": 1.2544, + "step": 1390 + }, + { + "epoch": 0.22804088447285906, + "grad_norm": 7.083252429962158, + "learning_rate": 1.779904833696781e-05, + "loss": 1.2435, + "step": 1400 + }, + { + "epoch": 0.2296697479333795, + "grad_norm": 9.345582962036133, + "learning_rate": 1.7766382646089635e-05, + "loss": 1.3229, + "step": 1410 + }, + { + "epoch": 0.2312986113938999, + "grad_norm": 10.011900901794434, + "learning_rate": 1.77335067770973e-05, + "loss": 1.3074, + "step": 1420 + }, + { + "epoch": 0.23292747485442034, + "grad_norm": 8.366225242614746, + "learning_rate": 1.770042161969564e-05, + "loss": 1.1657, + "step": 1430 + }, + { + "epoch": 0.23455633831494074, + "grad_norm": 13.709329605102539, + "learning_rate": 1.7667128069253362e-05, + "loss": 1.3257, + "step": 1440 + }, + { + "epoch": 0.23618520177546118, + "grad_norm": 9.527036666870117, + "learning_rate": 1.763362702677882e-05, + "loss": 1.1838, + "step": 1450 + }, + { + "epoch": 0.2378140652359816, + "grad_norm": 10.506624221801758, + "learning_rate": 1.759991939889562e-05, + "loss": 1.3324, + "step": 1460 + }, + { + "epoch": 0.23944292869650202, + "grad_norm": 9.029693603515625, + "learning_rate": 1.7566006097818123e-05, + "loss": 1.2102, + "step": 1470 + }, + { + "epoch": 0.24107179215702243, + "grad_norm": 9.59033203125, + "learning_rate": 1.7531888041326715e-05, + "loss": 1.3553, + "step": 1480 + }, + { + "epoch": 0.24270065561754287, + "grad_norm": 9.097341537475586, + "learning_rate": 1.7497566152742975e-05, + "loss": 1.3165, + "step": 1490 + }, + { + "epoch": 0.24432951907806327, + "grad_norm": 19.68447494506836, + "learning_rate": 1.7463041360904714e-05, + "loss": 1.2733, + "step": 1500 + }, + { + "epoch": 0.24432951907806327, + "eval_loss": 1.1864172220230103, + "eval_runtime": 101.8915, + "eval_samples_per_second": 25.37, + "eval_steps_per_second": 25.37, + "step": 1500 + }, + { + "epoch": 0.2459583825385837, + "grad_norm": 9.446989059448242, + "learning_rate": 1.742831460014082e-05, + "loss": 1.1618, + "step": 1510 + }, + { + "epoch": 0.24758724599910412, + "grad_norm": 7.566542148590088, + "learning_rate": 1.7393386810245968e-05, + "loss": 1.271, + "step": 1520 + }, + { + "epoch": 0.24921610945962455, + "grad_norm": 12.757163047790527, + "learning_rate": 1.7358258936455203e-05, + "loss": 1.2472, + "step": 1530 + }, + { + "epoch": 0.25084497292014496, + "grad_norm": 6.86335563659668, + "learning_rate": 1.7322931929418338e-05, + "loss": 1.3589, + "step": 1540 + }, + { + "epoch": 0.2524738363806654, + "grad_norm": 8.989468574523926, + "learning_rate": 1.7287406745174253e-05, + "loss": 1.246, + "step": 1550 + }, + { + "epoch": 0.25410269984118583, + "grad_norm": 9.14529037475586, + "learning_rate": 1.7251684345125e-05, + "loss": 1.1678, + "step": 1560 + }, + { + "epoch": 0.2557315633017062, + "grad_norm": 9.417140007019043, + "learning_rate": 1.7215765696009795e-05, + "loss": 1.2109, + "step": 1570 + }, + { + "epoch": 0.25736042676222665, + "grad_norm": 8.230705261230469, + "learning_rate": 1.7179651769878854e-05, + "loss": 1.2815, + "step": 1580 + }, + { + "epoch": 0.2589892902227471, + "grad_norm": 7.2801289558410645, + 
"learning_rate": 1.7143343544067094e-05, + "loss": 1.3492, + "step": 1590 + }, + { + "epoch": 0.2606181536832675, + "grad_norm": 6.567676544189453, + "learning_rate": 1.7106842001167664e-05, + "loss": 1.2145, + "step": 1600 + }, + { + "epoch": 0.2622470171437879, + "grad_norm": 7.0790910720825195, + "learning_rate": 1.7070148129005373e-05, + "loss": 1.2142, + "step": 1610 + }, + { + "epoch": 0.26387588060430833, + "grad_norm": 11.59786319732666, + "learning_rate": 1.7033262920609947e-05, + "loss": 1.2607, + "step": 1620 + }, + { + "epoch": 0.26550474406482877, + "grad_norm": 7.409655570983887, + "learning_rate": 1.699618737418917e-05, + "loss": 1.246, + "step": 1630 + }, + { + "epoch": 0.2671336075253492, + "grad_norm": 7.234554767608643, + "learning_rate": 1.6958922493101844e-05, + "loss": 1.2376, + "step": 1640 + }, + { + "epoch": 0.2687624709858696, + "grad_norm": 8.587789535522461, + "learning_rate": 1.6921469285830654e-05, + "loss": 1.312, + "step": 1650 + }, + { + "epoch": 0.27039133444639, + "grad_norm": 8.513998031616211, + "learning_rate": 1.688382876595487e-05, + "loss": 1.3469, + "step": 1660 + }, + { + "epoch": 0.27202019790691045, + "grad_norm": 7.039551258087158, + "learning_rate": 1.684600195212293e-05, + "loss": 1.3468, + "step": 1670 + }, + { + "epoch": 0.2736490613674309, + "grad_norm": 11.176895141601562, + "learning_rate": 1.6807989868024845e-05, + "loss": 1.419, + "step": 1680 + }, + { + "epoch": 0.27527792482795127, + "grad_norm": 8.219011306762695, + "learning_rate": 1.676979354236452e-05, + "loss": 1.3015, + "step": 1690 + }, + { + "epoch": 0.2769067882884717, + "grad_norm": 10.750848770141602, + "learning_rate": 1.673141400883191e-05, + "loss": 1.2978, + "step": 1700 + }, + { + "epoch": 0.27853565174899214, + "grad_norm": 11.351160049438477, + "learning_rate": 1.6692852306075033e-05, + "loss": 1.2648, + "step": 1710 + }, + { + "epoch": 0.2801645152095126, + "grad_norm": 7.423020362854004, + "learning_rate": 1.665410947767188e-05, + "loss": 1.2352, + "step": 1720 + }, + { + "epoch": 0.28179337867003296, + "grad_norm": 7.212891101837158, + "learning_rate": 1.6615186572102154e-05, + "loss": 1.2899, + "step": 1730 + }, + { + "epoch": 0.2834222421305534, + "grad_norm": 8.066819190979004, + "learning_rate": 1.6576084642718915e-05, + "loss": 1.2763, + "step": 1740 + }, + { + "epoch": 0.28505110559107383, + "grad_norm": 8.984209060668945, + "learning_rate": 1.653680474772006e-05, + "loss": 1.3167, + "step": 1750 + }, + { + "epoch": 0.28667996905159426, + "grad_norm": 17.02054786682129, + "learning_rate": 1.6497347950119687e-05, + "loss": 1.2149, + "step": 1760 + }, + { + "epoch": 0.2883088325121147, + "grad_norm": 10.05130386352539, + "learning_rate": 1.645771531771933e-05, + "loss": 1.1194, + "step": 1770 + }, + { + "epoch": 0.2899376959726351, + "grad_norm": 7.705258369445801, + "learning_rate": 1.6417907923079057e-05, + "loss": 1.2395, + "step": 1780 + }, + { + "epoch": 0.2915665594331555, + "grad_norm": 10.932085037231445, + "learning_rate": 1.6377926843488462e-05, + "loss": 1.3887, + "step": 1790 + }, + { + "epoch": 0.29319542289367595, + "grad_norm": 12.111505508422852, + "learning_rate": 1.633777316093748e-05, + "loss": 1.2902, + "step": 1800 + }, + { + "epoch": 0.2948242863541964, + "grad_norm": 7.594025135040283, + "learning_rate": 1.6297447962087133e-05, + "loss": 1.3156, + "step": 1810 + }, + { + "epoch": 0.29645314981471677, + "grad_norm": 6.600276470184326, + "learning_rate": 1.625695233824011e-05, + "loss": 1.2334, + "step": 1820 + }, + { + "epoch": 
0.2980820132752372, + "grad_norm": 8.619099617004395, + "learning_rate": 1.621628738531123e-05, + "loss": 1.2654, + "step": 1830 + }, + { + "epoch": 0.29971087673575764, + "grad_norm": 8.581014633178711, + "learning_rate": 1.6175454203797786e-05, + "loss": 1.3179, + "step": 1840 + }, + { + "epoch": 0.30133974019627807, + "grad_norm": 11.676705360412598, + "learning_rate": 1.6134453898749778e-05, + "loss": 1.311, + "step": 1850 + }, + { + "epoch": 0.30296860365679845, + "grad_norm": 8.072620391845703, + "learning_rate": 1.6093287579739983e-05, + "loss": 1.1559, + "step": 1860 + }, + { + "epoch": 0.3045974671173189, + "grad_norm": 8.07971477508545, + "learning_rate": 1.605195636083395e-05, + "loss": 1.3077, + "step": 1870 + }, + { + "epoch": 0.3062263305778393, + "grad_norm": 8.255348205566406, + "learning_rate": 1.6010461360559823e-05, + "loss": 1.2889, + "step": 1880 + }, + { + "epoch": 0.30785519403835976, + "grad_norm": 9.381460189819336, + "learning_rate": 1.5968803701878107e-05, + "loss": 1.2049, + "step": 1890 + }, + { + "epoch": 0.30948405749888014, + "grad_norm": 10.263215065002441, + "learning_rate": 1.5926984512151243e-05, + "loss": 1.2165, + "step": 1900 + }, + { + "epoch": 0.3111129209594006, + "grad_norm": 7.69245719909668, + "learning_rate": 1.588500492311312e-05, + "loss": 1.2796, + "step": 1910 + }, + { + "epoch": 0.312741784419921, + "grad_norm": 6.975790023803711, + "learning_rate": 1.5842866070838444e-05, + "loss": 1.306, + "step": 1920 + }, + { + "epoch": 0.31437064788044145, + "grad_norm": 5.779184818267822, + "learning_rate": 1.5800569095711983e-05, + "loss": 1.233, + "step": 1930 + }, + { + "epoch": 0.3159995113409618, + "grad_norm": 8.1918306350708, + "learning_rate": 1.575811514239772e-05, + "loss": 1.2872, + "step": 1940 + }, + { + "epoch": 0.31762837480148226, + "grad_norm": 10.501468658447266, + "learning_rate": 1.5715505359807862e-05, + "loss": 1.1615, + "step": 1950 + }, + { + "epoch": 0.3192572382620027, + "grad_norm": 5.401251792907715, + "learning_rate": 1.567274090107176e-05, + "loss": 1.2808, + "step": 1960 + }, + { + "epoch": 0.32088610172252313, + "grad_norm": 7.17600154876709, + "learning_rate": 1.5629822923504692e-05, + "loss": 1.3381, + "step": 1970 + }, + { + "epoch": 0.3225149651830435, + "grad_norm": 7.486255645751953, + "learning_rate": 1.558675258857654e-05, + "loss": 1.2904, + "step": 1980 + }, + { + "epoch": 0.32414382864356395, + "grad_norm": 11.746787071228027, + "learning_rate": 1.5543531061880374e-05, + "loss": 1.2446, + "step": 1990 + }, + { + "epoch": 0.3257726921040844, + "grad_norm": 8.739921569824219, + "learning_rate": 1.55001595131009e-05, + "loss": 1.265, + "step": 2000 + }, + { + "epoch": 0.3257726921040844, + "eval_loss": 1.175255537033081, + "eval_runtime": 102.6201, + "eval_samples_per_second": 25.19, + "eval_steps_per_second": 25.19, + "step": 2000 + }, + { + "epoch": 0.3274015555646048, + "grad_norm": 7.848710536956787, + "learning_rate": 1.5456639115982795e-05, + "loss": 1.3146, + "step": 2010 + }, + { + "epoch": 0.3290304190251252, + "grad_norm": 11.283839225769043, + "learning_rate": 1.5412971048298964e-05, + "loss": 1.0946, + "step": 2020 + }, + { + "epoch": 0.33065928248564563, + "grad_norm": 11.305326461791992, + "learning_rate": 1.536915649181864e-05, + "loss": 1.2178, + "step": 2030 + }, + { + "epoch": 0.33228814594616607, + "grad_norm": 7.214547157287598, + "learning_rate": 1.5325196632275424e-05, + "loss": 1.2246, + "step": 2040 + }, + { + "epoch": 0.3339170094066865, + "grad_norm": 8.374602317810059, + 
"learning_rate": 1.528109265933519e-05, + "loss": 1.2537, + "step": 2050 + }, + { + "epoch": 0.3355458728672069, + "grad_norm": 7.756241321563721, + "learning_rate": 1.5236845766563881e-05, + "loss": 1.2351, + "step": 2060 + }, + { + "epoch": 0.3371747363277273, + "grad_norm": 13.376108169555664, + "learning_rate": 1.5192457151395226e-05, + "loss": 1.2366, + "step": 2070 + }, + { + "epoch": 0.33880359978824776, + "grad_norm": 8.468932151794434, + "learning_rate": 1.5147928015098309e-05, + "loss": 1.2931, + "step": 2080 + }, + { + "epoch": 0.3404324632487682, + "grad_norm": 7.511579513549805, + "learning_rate": 1.5103259562745084e-05, + "loss": 1.3374, + "step": 2090 + }, + { + "epoch": 0.34206132670928857, + "grad_norm": 8.611263275146484, + "learning_rate": 1.5058453003177756e-05, + "loss": 1.3561, + "step": 2100 + }, + { + "epoch": 0.343690190169809, + "grad_norm": 8.382079124450684, + "learning_rate": 1.5013509548976049e-05, + "loss": 1.34, + "step": 2110 + }, + { + "epoch": 0.34531905363032944, + "grad_norm": 6.7042365074157715, + "learning_rate": 1.4968430416424417e-05, + "loss": 1.2588, + "step": 2120 + }, + { + "epoch": 0.3469479170908499, + "grad_norm": 8.887292861938477, + "learning_rate": 1.4923216825479115e-05, + "loss": 1.3223, + "step": 2130 + }, + { + "epoch": 0.34857678055137026, + "grad_norm": 6.505458831787109, + "learning_rate": 1.4877869999735175e-05, + "loss": 1.3147, + "step": 2140 + }, + { + "epoch": 0.3502056440118907, + "grad_norm": 12.044365882873535, + "learning_rate": 1.4832391166393316e-05, + "loss": 1.2917, + "step": 2150 + }, + { + "epoch": 0.35183450747241113, + "grad_norm": 7.598618507385254, + "learning_rate": 1.4786781556226713e-05, + "loss": 1.1821, + "step": 2160 + }, + { + "epoch": 0.35346337093293156, + "grad_norm": 8.489738464355469, + "learning_rate": 1.4741042403547692e-05, + "loss": 1.2837, + "step": 2170 + }, + { + "epoch": 0.35509223439345194, + "grad_norm": 12.408584594726562, + "learning_rate": 1.4695174946174334e-05, + "loss": 1.2392, + "step": 2180 + }, + { + "epoch": 0.3567210978539724, + "grad_norm": 9.772265434265137, + "learning_rate": 1.4649180425396972e-05, + "loss": 1.3963, + "step": 2190 + }, + { + "epoch": 0.3583499613144928, + "grad_norm": 8.034818649291992, + "learning_rate": 1.4603060085944594e-05, + "loss": 1.2304, + "step": 2200 + }, + { + "epoch": 0.35997882477501325, + "grad_norm": 8.509257316589355, + "learning_rate": 1.455681517595117e-05, + "loss": 1.1302, + "step": 2210 + }, + { + "epoch": 0.36160768823553363, + "grad_norm": 7.198489189147949, + "learning_rate": 1.4510446946921857e-05, + "loss": 1.3122, + "step": 2220 + }, + { + "epoch": 0.36323655169605407, + "grad_norm": 13.541308403015137, + "learning_rate": 1.4463956653699148e-05, + "loss": 1.2654, + "step": 2230 + }, + { + "epoch": 0.3648654151565745, + "grad_norm": 9.754889488220215, + "learning_rate": 1.4417345554428898e-05, + "loss": 1.2936, + "step": 2240 + }, + { + "epoch": 0.36649427861709494, + "grad_norm": 11.386918067932129, + "learning_rate": 1.437061491052629e-05, + "loss": 1.2025, + "step": 2250 + }, + { + "epoch": 0.3681231420776153, + "grad_norm": 9.887473106384277, + "learning_rate": 1.4323765986641681e-05, + "loss": 1.3471, + "step": 2260 + }, + { + "epoch": 0.36975200553813575, + "grad_norm": 11.245658874511719, + "learning_rate": 1.4276800050626385e-05, + "loss": 1.3279, + "step": 2270 + }, + { + "epoch": 0.3713808689986562, + "grad_norm": 5.411240100860596, + "learning_rate": 1.4229718373498371e-05, + "loss": 1.2211, + "step": 2280 + }, + { + 
"epoch": 0.3730097324591766, + "grad_norm": 9.82564926147461, + "learning_rate": 1.4182522229407854e-05, + "loss": 1.0828, + "step": 2290 + }, + { + "epoch": 0.374638595919697, + "grad_norm": 10.895565032958984, + "learning_rate": 1.413521289560281e-05, + "loss": 1.2662, + "step": 2300 + }, + { + "epoch": 0.37626745938021744, + "grad_norm": 8.565170288085938, + "learning_rate": 1.4087791652394427e-05, + "loss": 1.1959, + "step": 2310 + }, + { + "epoch": 0.3778963228407379, + "grad_norm": 8.679253578186035, + "learning_rate": 1.404025978312244e-05, + "loss": 1.3137, + "step": 2320 + }, + { + "epoch": 0.3795251863012583, + "grad_norm": 7.74443244934082, + "learning_rate": 1.3992618574120415e-05, + "loss": 1.2227, + "step": 2330 + }, + { + "epoch": 0.38115404976177875, + "grad_norm": 6.578245639801025, + "learning_rate": 1.3944869314680922e-05, + "loss": 1.2857, + "step": 2340 + }, + { + "epoch": 0.3827829132222991, + "grad_norm": 7.3876237869262695, + "learning_rate": 1.3897013297020651e-05, + "loss": 1.3174, + "step": 2350 + }, + { + "epoch": 0.38441177668281956, + "grad_norm": 7.339137077331543, + "learning_rate": 1.3849051816245451e-05, + "loss": 1.281, + "step": 2360 + }, + { + "epoch": 0.38604064014334, + "grad_norm": 9.710099220275879, + "learning_rate": 1.3800986170315263e-05, + "loss": 1.3175, + "step": 2370 + }, + { + "epoch": 0.38766950360386043, + "grad_norm": 8.572495460510254, + "learning_rate": 1.3752817660009004e-05, + "loss": 1.3693, + "step": 2380 + }, + { + "epoch": 0.3892983670643808, + "grad_norm": 8.108867645263672, + "learning_rate": 1.3704547588889368e-05, + "loss": 1.3073, + "step": 2390 + }, + { + "epoch": 0.39092723052490125, + "grad_norm": 6.642714500427246, + "learning_rate": 1.3656177263267534e-05, + "loss": 1.2876, + "step": 2400 + }, + { + "epoch": 0.3925560939854217, + "grad_norm": 8.35530948638916, + "learning_rate": 1.3607707992167836e-05, + "loss": 1.3244, + "step": 2410 + }, + { + "epoch": 0.3941849574459421, + "grad_norm": 13.95084285736084, + "learning_rate": 1.3559141087292313e-05, + "loss": 1.2599, + "step": 2420 + }, + { + "epoch": 0.3958138209064625, + "grad_norm": 9.53144645690918, + "learning_rate": 1.3510477862985233e-05, + "loss": 1.1835, + "step": 2430 + }, + { + "epoch": 0.39744268436698293, + "grad_norm": 7.759304046630859, + "learning_rate": 1.3461719636197503e-05, + "loss": 1.2536, + "step": 2440 + }, + { + "epoch": 0.39907154782750337, + "grad_norm": 12.239033699035645, + "learning_rate": 1.3412867726451051e-05, + "loss": 1.2358, + "step": 2450 + }, + { + "epoch": 0.4007004112880238, + "grad_norm": 8.611287117004395, + "learning_rate": 1.3363923455803098e-05, + "loss": 1.273, + "step": 2460 + }, + { + "epoch": 0.4023292747485442, + "grad_norm": 9.023387908935547, + "learning_rate": 1.3314888148810381e-05, + "loss": 1.2195, + "step": 2470 + }, + { + "epoch": 0.4039581382090646, + "grad_norm": 10.45922565460205, + "learning_rate": 1.3265763132493325e-05, + "loss": 1.2142, + "step": 2480 + }, + { + "epoch": 0.40558700166958506, + "grad_norm": 9.23582649230957, + "learning_rate": 1.3216549736300108e-05, + "loss": 1.3208, + "step": 2490 + }, + { + "epoch": 0.4072158651301055, + "grad_norm": 12.40174674987793, + "learning_rate": 1.3167249292070701e-05, + "loss": 1.2436, + "step": 2500 + }, + { + "epoch": 0.4072158651301055, + "eval_loss": 1.1542352437973022, + "eval_runtime": 102.4621, + "eval_samples_per_second": 25.229, + "eval_steps_per_second": 25.229, + "step": 2500 + }, + { + "epoch": 0.40884472859062587, + "grad_norm": 11.083693504333496, 
+ "learning_rate": 1.311786313400081e-05, + "loss": 1.1723, + "step": 2510 + }, + { + "epoch": 0.4104735920511463, + "grad_norm": 6.602702617645264, + "learning_rate": 1.3068392598605775e-05, + "loss": 1.3112, + "step": 2520 + }, + { + "epoch": 0.41210245551166674, + "grad_norm": 5.926374912261963, + "learning_rate": 1.3018839024684407e-05, + "loss": 1.1096, + "step": 2530 + }, + { + "epoch": 0.4137313189721872, + "grad_norm": 8.42911148071289, + "learning_rate": 1.296920375328275e-05, + "loss": 1.3634, + "step": 2540 + }, + { + "epoch": 0.41536018243270756, + "grad_norm": 7.443211555480957, + "learning_rate": 1.2919488127657788e-05, + "loss": 1.1775, + "step": 2550 + }, + { + "epoch": 0.416989045893228, + "grad_norm": 7.255467414855957, + "learning_rate": 1.28696934932411e-05, + "loss": 1.2215, + "step": 2560 + }, + { + "epoch": 0.41861790935374843, + "grad_norm": 7.436680316925049, + "learning_rate": 1.2819821197602434e-05, + "loss": 1.3608, + "step": 2570 + }, + { + "epoch": 0.42024677281426887, + "grad_norm": 9.471477508544922, + "learning_rate": 1.2769872590413262e-05, + "loss": 1.2015, + "step": 2580 + }, + { + "epoch": 0.42187563627478925, + "grad_norm": 10.229029655456543, + "learning_rate": 1.271984902341023e-05, + "loss": 1.1578, + "step": 2590 + }, + { + "epoch": 0.4235044997353097, + "grad_norm": 9.242801666259766, + "learning_rate": 1.2669751850358593e-05, + "loss": 1.2569, + "step": 2600 + }, + { + "epoch": 0.4251333631958301, + "grad_norm": 7.492494106292725, + "learning_rate": 1.2619582427015575e-05, + "loss": 1.1613, + "step": 2610 + }, + { + "epoch": 0.42676222665635055, + "grad_norm": 9.39388370513916, + "learning_rate": 1.256934211109367e-05, + "loss": 1.2344, + "step": 2620 + }, + { + "epoch": 0.42839109011687093, + "grad_norm": 17.988927841186523, + "learning_rate": 1.2519032262223913e-05, + "loss": 1.2001, + "step": 2630 + }, + { + "epoch": 0.43001995357739137, + "grad_norm": 8.884842872619629, + "learning_rate": 1.2468654241919077e-05, + "loss": 1.3394, + "step": 2640 + }, + { + "epoch": 0.4316488170379118, + "grad_norm": 5.588969707489014, + "learning_rate": 1.2418209413536822e-05, + "loss": 1.2306, + "step": 2650 + }, + { + "epoch": 0.43327768049843224, + "grad_norm": 9.964385032653809, + "learning_rate": 1.2367699142242808e-05, + "loss": 1.3004, + "step": 2660 + }, + { + "epoch": 0.4349065439589526, + "grad_norm": 9.903732299804688, + "learning_rate": 1.2317124794973757e-05, + "loss": 1.2649, + "step": 2670 + }, + { + "epoch": 0.43653540741947305, + "grad_norm": 8.422842979431152, + "learning_rate": 1.2266487740400432e-05, + "loss": 1.3842, + "step": 2680 + }, + { + "epoch": 0.4381642708799935, + "grad_norm": 6.042238235473633, + "learning_rate": 1.2215789348890627e-05, + "loss": 1.1659, + "step": 2690 + }, + { + "epoch": 0.4397931343405139, + "grad_norm": 10.695905685424805, + "learning_rate": 1.216503099247207e-05, + "loss": 1.1943, + "step": 2700 + }, + { + "epoch": 0.4414219978010343, + "grad_norm": 7.222139835357666, + "learning_rate": 1.2114214044795287e-05, + "loss": 1.1867, + "step": 2710 + }, + { + "epoch": 0.44305086126155474, + "grad_norm": 8.005696296691895, + "learning_rate": 1.206333988109644e-05, + "loss": 1.2714, + "step": 2720 + }, + { + "epoch": 0.4446797247220752, + "grad_norm": 10.262436866760254, + "learning_rate": 1.2012409878160093e-05, + "loss": 1.2536, + "step": 2730 + }, + { + "epoch": 0.4463085881825956, + "grad_norm": 5.630941867828369, + "learning_rate": 1.196142541428197e-05, + "loss": 1.2511, + "step": 2740 + }, + { + "epoch": 
0.447937451643116, + "grad_norm": 5.890045642852783, + "learning_rate": 1.1910387869231646e-05, + "loss": 1.204, + "step": 2750 + }, + { + "epoch": 0.4495663151036364, + "grad_norm": 7.118770122528076, + "learning_rate": 1.1859298624215202e-05, + "loss": 1.2963, + "step": 2760 + }, + { + "epoch": 0.45119517856415686, + "grad_norm": 6.118231773376465, + "learning_rate": 1.180815906183786e-05, + "loss": 1.1031, + "step": 2770 + }, + { + "epoch": 0.4528240420246773, + "grad_norm": 9.652909278869629, + "learning_rate": 1.175697056606655e-05, + "loss": 1.2406, + "step": 2780 + }, + { + "epoch": 0.4544529054851977, + "grad_norm": 8.817192077636719, + "learning_rate": 1.170573452219247e-05, + "loss": 1.1656, + "step": 2790 + }, + { + "epoch": 0.4560817689457181, + "grad_norm": 7.891348838806152, + "learning_rate": 1.1654452316793592e-05, + "loss": 1.1508, + "step": 2800 + }, + { + "epoch": 0.45771063240623855, + "grad_norm": 7.836014270782471, + "learning_rate": 1.1603125337697129e-05, + "loss": 1.2084, + "step": 2810 + }, + { + "epoch": 0.459339495866759, + "grad_norm": 7.247401714324951, + "learning_rate": 1.1551754973941996e-05, + "loss": 1.2001, + "step": 2820 + }, + { + "epoch": 0.46096835932727936, + "grad_norm": 7.9783830642700195, + "learning_rate": 1.1500342615741193e-05, + "loss": 1.2263, + "step": 2830 + }, + { + "epoch": 0.4625972227877998, + "grad_norm": 8.071614265441895, + "learning_rate": 1.144888965444421e-05, + "loss": 1.1689, + "step": 2840 + }, + { + "epoch": 0.46422608624832024, + "grad_norm": 13.862833976745605, + "learning_rate": 1.1397397482499352e-05, + "loss": 1.2704, + "step": 2850 + }, + { + "epoch": 0.46585494970884067, + "grad_norm": 10.272599220275879, + "learning_rate": 1.1345867493416067e-05, + "loss": 1.3094, + "step": 2860 + }, + { + "epoch": 0.46748381316936105, + "grad_norm": 8.56863784790039, + "learning_rate": 1.1294301081727235e-05, + "loss": 1.2395, + "step": 2870 + }, + { + "epoch": 0.4691126766298815, + "grad_norm": 9.376766204833984, + "learning_rate": 1.1242699642951411e-05, + "loss": 1.2032, + "step": 2880 + }, + { + "epoch": 0.4707415400904019, + "grad_norm": 10.573633193969727, + "learning_rate": 1.1191064573555094e-05, + "loss": 1.2421, + "step": 2890 + }, + { + "epoch": 0.47237040355092236, + "grad_norm": 10.285879135131836, + "learning_rate": 1.1139397270914893e-05, + "loss": 1.3089, + "step": 2900 + }, + { + "epoch": 0.4739992670114428, + "grad_norm": 6.976883888244629, + "learning_rate": 1.1087699133279743e-05, + "loss": 1.1944, + "step": 2910 + }, + { + "epoch": 0.4756281304719632, + "grad_norm": 8.282888412475586, + "learning_rate": 1.1035971559733047e-05, + "loss": 1.2164, + "step": 2920 + }, + { + "epoch": 0.4772569939324836, + "grad_norm": 12.169346809387207, + "learning_rate": 1.0984215950154821e-05, + "loss": 1.2526, + "step": 2930 + }, + { + "epoch": 0.47888585739300404, + "grad_norm": 10.577465057373047, + "learning_rate": 1.0932433705183806e-05, + "loss": 1.252, + "step": 2940 + }, + { + "epoch": 0.4805147208535245, + "grad_norm": 10.231258392333984, + "learning_rate": 1.0880626226179566e-05, + "loss": 1.2123, + "step": 2950 + }, + { + "epoch": 0.48214358431404486, + "grad_norm": 9.532476425170898, + "learning_rate": 1.0828794915184556e-05, + "loss": 1.0514, + "step": 2960 + }, + { + "epoch": 0.4837724477745653, + "grad_norm": 7.888957500457764, + "learning_rate": 1.0776941174886204e-05, + "loss": 1.1282, + "step": 2970 + }, + { + "epoch": 0.48540131123508573, + "grad_norm": 13.633064270019531, + "learning_rate": 
1.072506640857891e-05, + "loss": 1.2246, + "step": 2980 + }, + { + "epoch": 0.48703017469560617, + "grad_norm": 10.267495155334473, + "learning_rate": 1.06731720201261e-05, + "loss": 1.2691, + "step": 2990 + }, + { + "epoch": 0.48865903815612655, + "grad_norm": 11.644023895263672, + "learning_rate": 1.0621259413922234e-05, + "loss": 1.2935, + "step": 3000 + }, + { + "epoch": 0.48865903815612655, + "eval_loss": 1.1448081731796265, + "eval_runtime": 102.8493, + "eval_samples_per_second": 25.134, + "eval_steps_per_second": 25.134, + "step": 3000 + }, + { + "epoch": 0.490287901616647, + "grad_norm": 9.024466514587402, + "learning_rate": 1.056932999485477e-05, + "loss": 1.1484, + "step": 3010 + }, + { + "epoch": 0.4919167650771674, + "grad_norm": 5.3878655433654785, + "learning_rate": 1.0517385168266193e-05, + "loss": 1.1989, + "step": 3020 + }, + { + "epoch": 0.49354562853768785, + "grad_norm": 7.960078716278076, + "learning_rate": 1.0465426339915927e-05, + "loss": 1.28, + "step": 3030 + }, + { + "epoch": 0.49517449199820823, + "grad_norm": 9.044086456298828, + "learning_rate": 1.041345491594234e-05, + "loss": 1.1462, + "step": 3040 + }, + { + "epoch": 0.49680335545872867, + "grad_norm": 9.06221866607666, + "learning_rate": 1.0361472302824656e-05, + "loss": 1.2558, + "step": 3050 + }, + { + "epoch": 0.4984322189192491, + "grad_norm": 7.99545955657959, + "learning_rate": 1.0309479907344915e-05, + "loss": 1.3108, + "step": 3060 + }, + { + "epoch": 0.5000610823797695, + "grad_norm": 11.91790771484375, + "learning_rate": 1.0257479136549889e-05, + "loss": 1.3106, + "step": 3070 + }, + { + "epoch": 0.5016899458402899, + "grad_norm": 9.534984588623047, + "learning_rate": 1.0205471397713002e-05, + "loss": 1.2752, + "step": 3080 + }, + { + "epoch": 0.5033188093008104, + "grad_norm": 8.188611030578613, + "learning_rate": 1.0153458098296265e-05, + "loss": 1.2704, + "step": 3090 + }, + { + "epoch": 0.5049476727613308, + "grad_norm": 10.754870414733887, + "learning_rate": 1.0101440645912156e-05, + "loss": 1.2822, + "step": 3100 + }, + { + "epoch": 0.5065765362218512, + "grad_norm": 9.690641403198242, + "learning_rate": 1.0049420448285554e-05, + "loss": 1.1982, + "step": 3110 + }, + { + "epoch": 0.5082053996823717, + "grad_norm": 7.106996059417725, + "learning_rate": 9.997398913215629e-06, + "loss": 1.1991, + "step": 3120 + }, + { + "epoch": 0.509834263142892, + "grad_norm": 9.254938125610352, + "learning_rate": 9.945377448537744e-06, + "loss": 1.356, + "step": 3130 + }, + { + "epoch": 0.5114631266034124, + "grad_norm": 8.23971939086914, + "learning_rate": 9.893357462085355e-06, + "loss": 1.1756, + "step": 3140 + }, + { + "epoch": 0.5130919900639329, + "grad_norm": 7.241464614868164, + "learning_rate": 9.841340361651921e-06, + "loss": 1.1899, + "step": 3150 + }, + { + "epoch": 0.5147208535244533, + "grad_norm": 10.916207313537598, + "learning_rate": 9.78932755495279e-06, + "loss": 1.261, + "step": 3160 + }, + { + "epoch": 0.5163497169849738, + "grad_norm": 10.14999771118164, + "learning_rate": 9.737320449587113e-06, + "loss": 1.244, + "step": 3170 + }, + { + "epoch": 0.5179785804454942, + "grad_norm": 10.535028457641602, + "learning_rate": 9.68532045299975e-06, + "loss": 1.2962, + "step": 3180 + }, + { + "epoch": 0.5196074439060145, + "grad_norm": 12.43609619140625, + "learning_rate": 9.63332897244318e-06, + "loss": 1.2515, + "step": 3190 + }, + { + "epoch": 0.521236307366535, + "grad_norm": 8.834859848022461, + "learning_rate": 9.581347414939416e-06, + "loss": 1.2559, + "step": 3200 + }, + { + "epoch": 
0.5228651708270554, + "grad_norm": 9.219295501708984, + "learning_rate": 9.529377187241921e-06, + "loss": 1.0892, + "step": 3210 + }, + { + "epoch": 0.5244940342875758, + "grad_norm": 6.687707424163818, + "learning_rate": 9.477419695797551e-06, + "loss": 1.2412, + "step": 3220 + }, + { + "epoch": 0.5261228977480963, + "grad_norm": 7.913536548614502, + "learning_rate": 9.425476346708489e-06, + "loss": 1.3012, + "step": 3230 + }, + { + "epoch": 0.5277517612086167, + "grad_norm": 10.74738597869873, + "learning_rate": 9.373548545694189e-06, + "loss": 1.2319, + "step": 3240 + }, + { + "epoch": 0.5293806246691372, + "grad_norm": 8.231693267822266, + "learning_rate": 9.321637698053327e-06, + "loss": 1.1856, + "step": 3250 + }, + { + "epoch": 0.5310094881296575, + "grad_norm": 10.926920890808105, + "learning_rate": 9.269745208625784e-06, + "loss": 1.3557, + "step": 3260 + }, + { + "epoch": 0.5326383515901779, + "grad_norm": 7.84801721572876, + "learning_rate": 9.217872481754619e-06, + "loss": 1.2767, + "step": 3270 + }, + { + "epoch": 0.5342672150506984, + "grad_norm": 9.671969413757324, + "learning_rate": 9.16602092124807e-06, + "loss": 1.2934, + "step": 3280 + }, + { + "epoch": 0.5358960785112188, + "grad_norm": 11.278433799743652, + "learning_rate": 9.11419193034155e-06, + "loss": 1.2475, + "step": 3290 + }, + { + "epoch": 0.5375249419717392, + "grad_norm": 8.938894271850586, + "learning_rate": 9.062386911659692e-06, + "loss": 1.2476, + "step": 3300 + }, + { + "epoch": 0.5391538054322597, + "grad_norm": 10.644549369812012, + "learning_rate": 9.010607267178372e-06, + "loss": 1.2407, + "step": 3310 + }, + { + "epoch": 0.54078266889278, + "grad_norm": 10.27429485321045, + "learning_rate": 8.958854398186774e-06, + "loss": 1.3306, + "step": 3320 + }, + { + "epoch": 0.5424115323533005, + "grad_norm": 5.996701717376709, + "learning_rate": 8.90712970524948e-06, + "loss": 1.162, + "step": 3330 + }, + { + "epoch": 0.5440403958138209, + "grad_norm": 11.066450119018555, + "learning_rate": 8.855434588168543e-06, + "loss": 1.16, + "step": 3340 + }, + { + "epoch": 0.5456692592743413, + "grad_norm": 8.821560859680176, + "learning_rate": 8.803770445945626e-06, + "loss": 1.2471, + "step": 3350 + }, + { + "epoch": 0.5472981227348618, + "grad_norm": 8.159625053405762, + "learning_rate": 8.752138676744128e-06, + "loss": 1.1295, + "step": 3360 + }, + { + "epoch": 0.5489269861953822, + "grad_norm": 10.010625839233398, + "learning_rate": 8.70054067785136e-06, + "loss": 1.1393, + "step": 3370 + }, + { + "epoch": 0.5505558496559025, + "grad_norm": 8.413679122924805, + "learning_rate": 8.648977845640713e-06, + "loss": 1.3249, + "step": 3380 + }, + { + "epoch": 0.552184713116423, + "grad_norm": 7.034830570220947, + "learning_rate": 8.597451575533884e-06, + "loss": 1.1537, + "step": 3390 + }, + { + "epoch": 0.5538135765769434, + "grad_norm": 7.142354488372803, + "learning_rate": 8.545963261963102e-06, + "loss": 1.3551, + "step": 3400 + }, + { + "epoch": 0.5554424400374639, + "grad_norm": 11.520668029785156, + "learning_rate": 8.494514298333401e-06, + "loss": 1.2437, + "step": 3410 + }, + { + "epoch": 0.5570713034979843, + "grad_norm": 9.558789253234863, + "learning_rate": 8.443106076984895e-06, + "loss": 1.3416, + "step": 3420 + }, + { + "epoch": 0.5587001669585047, + "grad_norm": 7.987767696380615, + "learning_rate": 8.39173998915512e-06, + "loss": 1.1504, + "step": 3430 + }, + { + "epoch": 0.5603290304190252, + "grad_norm": 8.003718376159668, + "learning_rate": 8.340417424941363e-06, + "loss": 1.1578, + "step": 3440 + }, 
+ { + "epoch": 0.5619578938795455, + "grad_norm": 9.900299072265625, + "learning_rate": 8.289139773263057e-06, + "loss": 1.2571, + "step": 3450 + }, + { + "epoch": 0.5635867573400659, + "grad_norm": 7.391176700592041, + "learning_rate": 8.237908421824186e-06, + "loss": 1.3128, + "step": 3460 + }, + { + "epoch": 0.5652156208005864, + "grad_norm": 7.9699387550354, + "learning_rate": 8.186724757075725e-06, + "loss": 1.2942, + "step": 3470 + }, + { + "epoch": 0.5668444842611068, + "grad_norm": 13.502717018127441, + "learning_rate": 8.135590164178136e-06, + "loss": 1.157, + "step": 3480 + }, + { + "epoch": 0.5684733477216273, + "grad_norm": 10.46247673034668, + "learning_rate": 8.084506026963859e-06, + "loss": 1.1876, + "step": 3490 + }, + { + "epoch": 0.5701022111821477, + "grad_norm": 6.954629898071289, + "learning_rate": 8.033473727899889e-06, + "loss": 1.2595, + "step": 3500 + }, + { + "epoch": 0.5701022111821477, + "eval_loss": 1.13480544090271, + "eval_runtime": 102.3333, + "eval_samples_per_second": 25.261, + "eval_steps_per_second": 25.261, + "step": 3500 + }, + { + "epoch": 0.571731074642668, + "grad_norm": 9.53747272491455, + "learning_rate": 7.982494648050341e-06, + "loss": 1.3107, + "step": 3510 + }, + { + "epoch": 0.5733599381031885, + "grad_norm": 9.204121589660645, + "learning_rate": 7.93157016703908e-06, + "loss": 1.3048, + "step": 3520 + }, + { + "epoch": 0.5749888015637089, + "grad_norm": 8.356864929199219, + "learning_rate": 7.880701663012387e-06, + "loss": 1.1239, + "step": 3530 + }, + { + "epoch": 0.5766176650242294, + "grad_norm": 6.577971935272217, + "learning_rate": 7.829890512601672e-06, + "loss": 1.2206, + "step": 3540 + }, + { + "epoch": 0.5782465284847498, + "grad_norm": 7.473880767822266, + "learning_rate": 7.779138090886202e-06, + "loss": 1.1229, + "step": 3550 + }, + { + "epoch": 0.5798753919452702, + "grad_norm": 10.256245613098145, + "learning_rate": 7.728445771355897e-06, + "loss": 1.2466, + "step": 3560 + }, + { + "epoch": 0.5815042554057906, + "grad_norm": 3.6034979820251465, + "learning_rate": 7.677814925874159e-06, + "loss": 1.1838, + "step": 3570 + }, + { + "epoch": 0.583133118866311, + "grad_norm": 13.721611976623535, + "learning_rate": 7.627246924640744e-06, + "loss": 1.2497, + "step": 3580 + }, + { + "epoch": 0.5847619823268314, + "grad_norm": 12.126471519470215, + "learning_rate": 7.57674313615469e-06, + "loss": 1.2542, + "step": 3590 + }, + { + "epoch": 0.5863908457873519, + "grad_norm": 9.544647216796875, + "learning_rate": 7.5263049271772645e-06, + "loss": 1.2369, + "step": 3600 + }, + { + "epoch": 0.5880197092478723, + "grad_norm": 9.008516311645508, + "learning_rate": 7.475933662694993e-06, + "loss": 1.1323, + "step": 3610 + }, + { + "epoch": 0.5896485727083928, + "grad_norm": 14.236581802368164, + "learning_rate": 7.425630705882707e-06, + "loss": 1.1859, + "step": 3620 + }, + { + "epoch": 0.5912774361689132, + "grad_norm": 7.743088722229004, + "learning_rate": 7.375397418066665e-06, + "loss": 1.2496, + "step": 3630 + }, + { + "epoch": 0.5929062996294335, + "grad_norm": 10.839030265808105, + "learning_rate": 7.3252351586876955e-06, + "loss": 1.2483, + "step": 3640 + }, + { + "epoch": 0.594535163089954, + "grad_norm": 10.632163047790527, + "learning_rate": 7.275145285264424e-06, + "loss": 1.2619, + "step": 3650 + }, + { + "epoch": 0.5961640265504744, + "grad_norm": 8.51111125946045, + "learning_rate": 7.2251291533565245e-06, + "loss": 1.2082, + "step": 3660 + }, + { + "epoch": 0.5977928900109948, + "grad_norm": 8.234833717346191, + "learning_rate": 
7.175188116528044e-06, + "loss": 1.2497, + "step": 3670 + }, + { + "epoch": 0.5994217534715153, + "grad_norm": 12.51526927947998, + "learning_rate": 7.125323526310752e-06, + "loss": 1.2674, + "step": 3680 + }, + { + "epoch": 0.6010506169320357, + "grad_norm": 8.174943923950195, + "learning_rate": 7.0755367321675915e-06, + "loss": 1.1269, + "step": 3690 + }, + { + "epoch": 0.6026794803925561, + "grad_norm": 8.4658842086792, + "learning_rate": 7.025829081456137e-06, + "loss": 1.2499, + "step": 3700 + }, + { + "epoch": 0.6043083438530765, + "grad_norm": 7.7957987785339355, + "learning_rate": 6.976201919392138e-06, + "loss": 1.2735, + "step": 3710 + }, + { + "epoch": 0.6059372073135969, + "grad_norm": 7.680945873260498, + "learning_rate": 6.926656589013127e-06, + "loss": 1.2044, + "step": 3720 + }, + { + "epoch": 0.6075660707741174, + "grad_norm": 8.13155460357666, + "learning_rate": 6.877194431142055e-06, + "loss": 1.2299, + "step": 3730 + }, + { + "epoch": 0.6091949342346378, + "grad_norm": 7.943605899810791, + "learning_rate": 6.827816784351011e-06, + "loss": 1.1769, + "step": 3740 + }, + { + "epoch": 0.6108237976951582, + "grad_norm": 8.881424903869629, + "learning_rate": 6.778524984924999e-06, + "loss": 1.1524, + "step": 3750 + }, + { + "epoch": 0.6124526611556786, + "grad_norm": 9.679656028747559, + "learning_rate": 6.729320366825785e-06, + "loss": 1.2131, + "step": 3760 + }, + { + "epoch": 0.614081524616199, + "grad_norm": 8.224056243896484, + "learning_rate": 6.68020426165577e-06, + "loss": 1.254, + "step": 3770 + }, + { + "epoch": 0.6157103880767195, + "grad_norm": 7.536296844482422, + "learning_rate": 6.631177998621982e-06, + "loss": 1.2487, + "step": 3780 + }, + { + "epoch": 0.6173392515372399, + "grad_norm": 8.117711067199707, + "learning_rate": 6.582242904500085e-06, + "loss": 1.2872, + "step": 3790 + }, + { + "epoch": 0.6189681149977603, + "grad_norm": 9.820115089416504, + "learning_rate": 6.53340030359848e-06, + "loss": 1.3597, + "step": 3800 + }, + { + "epoch": 0.6205969784582808, + "grad_norm": 8.39965534210205, + "learning_rate": 6.4846515177224735e-06, + "loss": 1.2167, + "step": 3810 + }, + { + "epoch": 0.6222258419188011, + "grad_norm": 14.166945457458496, + "learning_rate": 6.435997866138488e-06, + "loss": 1.347, + "step": 3820 + }, + { + "epoch": 0.6238547053793215, + "grad_norm": 7.368659496307373, + "learning_rate": 6.3874406655383755e-06, + "loss": 1.1975, + "step": 3830 + }, + { + "epoch": 0.625483568839842, + "grad_norm": 6.8791913986206055, + "learning_rate": 6.3389812300037774e-06, + "loss": 1.3515, + "step": 3840 + }, + { + "epoch": 0.6271124323003624, + "grad_norm": 8.488605499267578, + "learning_rate": 6.290620870970561e-06, + "loss": 1.2868, + "step": 3850 + }, + { + "epoch": 0.6287412957608829, + "grad_norm": 11.543606758117676, + "learning_rate": 6.242360897193331e-06, + "loss": 1.1796, + "step": 3860 + }, + { + "epoch": 0.6303701592214033, + "grad_norm": 7.279272556304932, + "learning_rate": 6.194202614710015e-06, + "loss": 1.1563, + "step": 3870 + }, + { + "epoch": 0.6319990226819237, + "grad_norm": 6.415475368499756, + "learning_rate": 6.146147326806509e-06, + "loss": 1.246, + "step": 3880 + }, + { + "epoch": 0.6336278861424441, + "grad_norm": 8.384225845336914, + "learning_rate": 6.098196333981421e-06, + "loss": 1.2252, + "step": 3890 + }, + { + "epoch": 0.6352567496029645, + "grad_norm": 7.170252323150635, + "learning_rate": 6.050350933910865e-06, + "loss": 1.0904, + "step": 3900 + }, + { + "epoch": 0.6368856130634849, + "grad_norm": 11.731266021728516, 
+ "learning_rate": 6.002612421413341e-06, + "loss": 1.196, + "step": 3910 + }, + { + "epoch": 0.6385144765240054, + "grad_norm": 8.852503776550293, + "learning_rate": 5.954982088414701e-06, + "loss": 1.3241, + "step": 3920 + }, + { + "epoch": 0.6401433399845258, + "grad_norm": 8.035179138183594, + "learning_rate": 5.9074612239131915e-06, + "loss": 1.2826, + "step": 3930 + }, + { + "epoch": 0.6417722034450463, + "grad_norm": 8.706337928771973, + "learning_rate": 5.8600511139445536e-06, + "loss": 1.1557, + "step": 3940 + }, + { + "epoch": 0.6434010669055666, + "grad_norm": 14.4612398147583, + "learning_rate": 5.81275304154723e-06, + "loss": 1.1653, + "step": 3950 + }, + { + "epoch": 0.645029930366087, + "grad_norm": 7.372416973114014, + "learning_rate": 5.765568286727646e-06, + "loss": 1.1933, + "step": 3960 + }, + { + "epoch": 0.6466587938266075, + "grad_norm": 8.995341300964355, + "learning_rate": 5.718498126425556e-06, + "loss": 1.1356, + "step": 3970 + }, + { + "epoch": 0.6482876572871279, + "grad_norm": 12.190003395080566, + "learning_rate": 5.671543834479503e-06, + "loss": 1.3165, + "step": 3980 + }, + { + "epoch": 0.6499165207476483, + "grad_norm": 6.376560211181641, + "learning_rate": 5.624706681592329e-06, + "loss": 1.2403, + "step": 3990 + }, + { + "epoch": 0.6515453842081688, + "grad_norm": 8.215283393859863, + "learning_rate": 5.5779879352968e-06, + "loss": 1.2896, + "step": 4000 + }, + { + "epoch": 0.6515453842081688, + "eval_loss": 1.1294902563095093, + "eval_runtime": 102.4055, + "eval_samples_per_second": 25.243, + "eval_steps_per_second": 25.243, + "step": 4000 + }, + { + "epoch": 0.6531742476686891, + "grad_norm": 7.186392307281494, + "learning_rate": 5.531388859921303e-06, + "loss": 1.2025, + "step": 4010 + }, + { + "epoch": 0.6548031111292096, + "grad_norm": 11.444113731384277, + "learning_rate": 5.484910716555607e-06, + "loss": 1.3191, + "step": 4020 + }, + { + "epoch": 0.65643197458973, + "grad_norm": 4.889548301696777, + "learning_rate": 5.438554763016775e-06, + "loss": 1.1232, + "step": 4030 + }, + { + "epoch": 0.6580608380502504, + "grad_norm": 12.841841697692871, + "learning_rate": 5.392322253815079e-06, + "loss": 1.2834, + "step": 4040 + }, + { + "epoch": 0.6596897015107709, + "grad_norm": 7.367560386657715, + "learning_rate": 5.3462144401200945e-06, + "loss": 1.3042, + "step": 4050 + }, + { + "epoch": 0.6613185649712913, + "grad_norm": 7.350122451782227, + "learning_rate": 5.300232569726805e-06, + "loss": 1.2925, + "step": 4060 + }, + { + "epoch": 0.6629474284318118, + "grad_norm": 6.530559539794922, + "learning_rate": 5.254377887021842e-06, + "loss": 1.2089, + "step": 4070 + }, + { + "epoch": 0.6645762918923321, + "grad_norm": 8.514747619628906, + "learning_rate": 5.20865163294983e-06, + "loss": 1.1635, + "step": 4080 + }, + { + "epoch": 0.6662051553528525, + "grad_norm": 10.842570304870605, + "learning_rate": 5.163055044979783e-06, + "loss": 1.1896, + "step": 4090 + }, + { + "epoch": 0.667834018813373, + "grad_norm": 10.257299423217773, + "learning_rate": 5.1175893570716075e-06, + "loss": 1.2857, + "step": 4100 + }, + { + "epoch": 0.6694628822738934, + "grad_norm": 7.264848709106445, + "learning_rate": 5.072255799642737e-06, + "loss": 1.2011, + "step": 4110 + }, + { + "epoch": 0.6710917457344138, + "grad_norm": 7.874101161956787, + "learning_rate": 5.027055599534802e-06, + "loss": 1.2601, + "step": 4120 + }, + { + "epoch": 0.6727206091949343, + "grad_norm": 9.67297077178955, + "learning_rate": 4.981989979980457e-06, + "loss": 1.2038, + "step": 4130 + }, + { + 
"epoch": 0.6743494726554546, + "grad_norm": 8.36242389678955, + "learning_rate": 4.93706016057026e-06, + "loss": 1.2007, + "step": 4140 + }, + { + "epoch": 0.6759783361159751, + "grad_norm": 8.81531810760498, + "learning_rate": 4.8922673572196625e-06, + "loss": 1.2002, + "step": 4150 + }, + { + "epoch": 0.6776071995764955, + "grad_norm": 13.88326358795166, + "learning_rate": 4.847612782136127e-06, + "loss": 1.2879, + "step": 4160 + }, + { + "epoch": 0.6792360630370159, + "grad_norm": 8.99789810180664, + "learning_rate": 4.803097643786289e-06, + "loss": 1.0767, + "step": 4170 + }, + { + "epoch": 0.6808649264975364, + "grad_norm": 9.136383056640625, + "learning_rate": 4.758723146863285e-06, + "loss": 1.2943, + "step": 4180 + }, + { + "epoch": 0.6824937899580568, + "grad_norm": 11.534010887145996, + "learning_rate": 4.714490492254134e-06, + "loss": 1.2505, + "step": 4190 + }, + { + "epoch": 0.6841226534185771, + "grad_norm": 7.77902889251709, + "learning_rate": 4.670400877007229e-06, + "loss": 1.3522, + "step": 4200 + }, + { + "epoch": 0.6857515168790976, + "grad_norm": 9.129291534423828, + "learning_rate": 4.6264554942999685e-06, + "loss": 1.2585, + "step": 4210 + }, + { + "epoch": 0.687380380339618, + "grad_norm": 9.164624214172363, + "learning_rate": 4.582655533406445e-06, + "loss": 1.2429, + "step": 4220 + }, + { + "epoch": 0.6890092438001385, + "grad_norm": 10.26278018951416, + "learning_rate": 4.539002179665256e-06, + "loss": 1.3034, + "step": 4230 + }, + { + "epoch": 0.6906381072606589, + "grad_norm": 6.888890743255615, + "learning_rate": 4.495496614447455e-06, + "loss": 1.2, + "step": 4240 + }, + { + "epoch": 0.6922669707211793, + "grad_norm": 6.9961090087890625, + "learning_rate": 4.452140015124539e-06, + "loss": 1.2924, + "step": 4250 + }, + { + "epoch": 0.6938958341816998, + "grad_norm": 9.418280601501465, + "learning_rate": 4.4089335550366275e-06, + "loss": 1.2678, + "step": 4260 + }, + { + "epoch": 0.6955246976422201, + "grad_norm": 9.817927360534668, + "learning_rate": 4.365878403460687e-06, + "loss": 1.1866, + "step": 4270 + }, + { + "epoch": 0.6971535611027405, + "grad_norm": 7.495052337646484, + "learning_rate": 4.322975725578871e-06, + "loss": 1.2452, + "step": 4280 + }, + { + "epoch": 0.698782424563261, + "grad_norm": 7.096460342407227, + "learning_rate": 4.280226682447026e-06, + "loss": 1.2315, + "step": 4290 + }, + { + "epoch": 0.7004112880237814, + "grad_norm": 8.440934181213379, + "learning_rate": 4.23763243096325e-06, + "loss": 1.1404, + "step": 4300 + }, + { + "epoch": 0.7020401514843019, + "grad_norm": 10.936495780944824, + "learning_rate": 4.195194123836569e-06, + "loss": 1.303, + "step": 4310 + }, + { + "epoch": 0.7036690149448223, + "grad_norm": 10.077718734741211, + "learning_rate": 4.152912909555775e-06, + "loss": 1.2253, + "step": 4320 + }, + { + "epoch": 0.7052978784053426, + "grad_norm": 10.520235061645508, + "learning_rate": 4.110789932358312e-06, + "loss": 1.1638, + "step": 4330 + }, + { + "epoch": 0.7069267418658631, + "grad_norm": 8.664091110229492, + "learning_rate": 4.068826332199336e-06, + "loss": 1.2585, + "step": 4340 + }, + { + "epoch": 0.7085556053263835, + "grad_norm": 9.869935989379883, + "learning_rate": 4.027023244720853e-06, + "loss": 1.2998, + "step": 4350 + }, + { + "epoch": 0.7101844687869039, + "grad_norm": 7.815759658813477, + "learning_rate": 3.985381801220975e-06, + "loss": 1.1673, + "step": 4360 + }, + { + "epoch": 0.7118133322474244, + "grad_norm": 9.876256942749023, + "learning_rate": 3.943903128623336e-06, + "loss": 1.1876, + 
"step": 4370 + }, + { + "epoch": 0.7134421957079448, + "grad_norm": 10.636608123779297, + "learning_rate": 3.902588349446551e-06, + "loss": 1.2142, + "step": 4380 + }, + { + "epoch": 0.7150710591684653, + "grad_norm": 8.32884693145752, + "learning_rate": 3.86143858177388e-06, + "loss": 1.3103, + "step": 4390 + }, + { + "epoch": 0.7166999226289856, + "grad_norm": 13.757806777954102, + "learning_rate": 3.820454939222946e-06, + "loss": 1.1283, + "step": 4400 + }, + { + "epoch": 0.718328786089506, + "grad_norm": 19.816259384155273, + "learning_rate": 3.7796385309155948e-06, + "loss": 1.2626, + "step": 4410 + }, + { + "epoch": 0.7199576495500265, + "grad_norm": 8.810699462890625, + "learning_rate": 3.7389904614479e-06, + "loss": 1.3254, + "step": 4420 + }, + { + "epoch": 0.7215865130105469, + "grad_norm": 7.040134429931641, + "learning_rate": 3.698511830860243e-06, + "loss": 1.3012, + "step": 4430 + }, + { + "epoch": 0.7232153764710673, + "grad_norm": 8.462788581848145, + "learning_rate": 3.658203734607567e-06, + "loss": 1.1578, + "step": 4440 + }, + { + "epoch": 0.7248442399315878, + "grad_norm": 8.182144165039062, + "learning_rate": 3.6180672635297243e-06, + "loss": 1.3664, + "step": 4450 + }, + { + "epoch": 0.7264731033921081, + "grad_norm": 11.486858367919922, + "learning_rate": 3.578103503821939e-06, + "loss": 1.192, + "step": 4460 + }, + { + "epoch": 0.7281019668526286, + "grad_norm": 8.237690925598145, + "learning_rate": 3.53831353700544e-06, + "loss": 1.1809, + "step": 4470 + }, + { + "epoch": 0.729730830313149, + "grad_norm": 6.926926612854004, + "learning_rate": 3.4986984398981662e-06, + "loss": 1.1828, + "step": 4480 + }, + { + "epoch": 0.7313596937736694, + "grad_norm": 8.473318099975586, + "learning_rate": 3.4592592845856388e-06, + "loss": 1.3035, + "step": 4490 + }, + { + "epoch": 0.7329885572341899, + "grad_norm": 9.865799903869629, + "learning_rate": 3.4199971383919538e-06, + "loss": 1.2081, + "step": 4500 + }, + { + "epoch": 0.7329885572341899, + "eval_loss": 1.1236326694488525, + "eval_runtime": 102.8937, + "eval_samples_per_second": 25.123, + "eval_steps_per_second": 25.123, + "step": 4500 + }, + { + "epoch": 0.7346174206947103, + "grad_norm": 9.886106491088867, + "learning_rate": 3.380913063850877e-06, + "loss": 1.2866, + "step": 4510 + }, + { + "epoch": 0.7362462841552306, + "grad_norm": 7.137485504150391, + "learning_rate": 3.342008118677108e-06, + "loss": 1.0974, + "step": 4520 + }, + { + "epoch": 0.7378751476157511, + "grad_norm": 9.091876029968262, + "learning_rate": 3.303283355737653e-06, + "loss": 1.2417, + "step": 4530 + }, + { + "epoch": 0.7395040110762715, + "grad_norm": 7.459966659545898, + "learning_rate": 3.2647398230233175e-06, + "loss": 1.2105, + "step": 4540 + }, + { + "epoch": 0.741132874536792, + "grad_norm": 7.54026460647583, + "learning_rate": 3.2263785636203635e-06, + "loss": 1.1231, + "step": 4550 + }, + { + "epoch": 0.7427617379973124, + "grad_norm": 9.739063262939453, + "learning_rate": 3.188200615682265e-06, + "loss": 1.1882, + "step": 4560 + }, + { + "epoch": 0.7443906014578328, + "grad_norm": 11.615285873413086, + "learning_rate": 3.150207012401629e-06, + "loss": 1.1598, + "step": 4570 + }, + { + "epoch": 0.7460194649183532, + "grad_norm": 7.322878837585449, + "learning_rate": 3.1123987819822234e-06, + "loss": 1.198, + "step": 4580 + }, + { + "epoch": 0.7476483283788736, + "grad_norm": 7.064319133758545, + "learning_rate": 3.0747769476111454e-06, + "loss": 1.1921, + "step": 4590 + }, + { + "epoch": 0.749277191839394, + "grad_norm": 
9.456534385681152, + "learning_rate": 3.037342527431152e-06, + "loss": 1.263, + "step": 4600 + }, + { + "epoch": 0.7509060552999145, + "grad_norm": 7.1396098136901855, + "learning_rate": 3.0000965345130904e-06, + "loss": 1.2136, + "step": 4610 + }, + { + "epoch": 0.7525349187604349, + "grad_norm": 7.965095520019531, + "learning_rate": 2.96303997682848e-06, + "loss": 1.2013, + "step": 4620 + }, + { + "epoch": 0.7541637822209554, + "grad_norm": 6.984764575958252, + "learning_rate": 2.9261738572222487e-06, + "loss": 1.2054, + "step": 4630 + }, + { + "epoch": 0.7557926456814757, + "grad_norm": 9.820785522460938, + "learning_rate": 2.889499173385576e-06, + "loss": 1.2689, + "step": 4640 + }, + { + "epoch": 0.7574215091419961, + "grad_norm": 12.76016616821289, + "learning_rate": 2.8530169178289068e-06, + "loss": 1.2673, + "step": 4650 + }, + { + "epoch": 0.7590503726025166, + "grad_norm": 6.978170871734619, + "learning_rate": 2.8167280778550897e-06, + "loss": 1.3873, + "step": 4660 + }, + { + "epoch": 0.760679236063037, + "grad_norm": 5.620461463928223, + "learning_rate": 2.7806336355326434e-06, + "loss": 1.1573, + "step": 4670 + }, + { + "epoch": 0.7623080995235575, + "grad_norm": 6.476634979248047, + "learning_rate": 2.744734567669203e-06, + "loss": 1.1334, + "step": 4680 + }, + { + "epoch": 0.7639369629840779, + "grad_norm": 6.739288330078125, + "learning_rate": 2.709031845785062e-06, + "loss": 1.2286, + "step": 4690 + }, + { + "epoch": 0.7655658264445983, + "grad_norm": 8.93143367767334, + "learning_rate": 2.673526436086894e-06, + "loss": 1.2131, + "step": 4700 + }, + { + "epoch": 0.7671946899051187, + "grad_norm": 17.24730110168457, + "learning_rate": 2.63821929944161e-06, + "loss": 1.2658, + "step": 4710 + }, + { + "epoch": 0.7688235533656391, + "grad_norm": 11.340949058532715, + "learning_rate": 2.6031113913503337e-06, + "loss": 1.2794, + "step": 4720 + }, + { + "epoch": 0.7704524168261595, + "grad_norm": 10.817314147949219, + "learning_rate": 2.5682036619225657e-06, + "loss": 1.3443, + "step": 4730 + }, + { + "epoch": 0.77208128028668, + "grad_norm": 17.411333084106445, + "learning_rate": 2.5334970558504613e-06, + "loss": 1.1532, + "step": 4740 + }, + { + "epoch": 0.7737101437472004, + "grad_norm": 9.042006492614746, + "learning_rate": 2.4989925123832583e-06, + "loss": 1.2526, + "step": 4750 + }, + { + "epoch": 0.7753390072077209, + "grad_norm": 9.995912551879883, + "learning_rate": 2.4646909653018724e-06, + "loss": 1.0986, + "step": 4760 + }, + { + "epoch": 0.7769678706682412, + "grad_norm": 11.739777565002441, + "learning_rate": 2.4305933428936137e-06, + "loss": 1.2693, + "step": 4770 + }, + { + "epoch": 0.7785967341287616, + "grad_norm": 11.440888404846191, + "learning_rate": 2.3967005679270736e-06, + "loss": 1.2691, + "step": 4780 + }, + { + "epoch": 0.7802255975892821, + "grad_norm": 9.173127174377441, + "learning_rate": 2.3630135576271563e-06, + "loss": 1.137, + "step": 4790 + }, + { + "epoch": 0.7818544610498025, + "grad_norm": 11.49284839630127, + "learning_rate": 2.329533223650233e-06, + "loss": 1.2192, + "step": 4800 + }, + { + "epoch": 0.7834833245103229, + "grad_norm": 6.268970012664795, + "learning_rate": 2.296260472059505e-06, + "loss": 1.1762, + "step": 4810 + }, + { + "epoch": 0.7851121879708434, + "grad_norm": 7.918376445770264, + "learning_rate": 2.2631962033004486e-06, + "loss": 1.2459, + "step": 4820 + }, + { + "epoch": 0.7867410514313637, + "grad_norm": 9.789480209350586, + "learning_rate": 2.230341312176476e-06, + "loss": 1.2437, + "step": 4830 + }, + { + "epoch": 
0.7883699148918842, + "grad_norm": 6.269535064697266, + "learning_rate": 2.197696687824703e-06, + "loss": 1.2836, + "step": 4840 + }, + { + "epoch": 0.7899987783524046, + "grad_norm": 8.587770462036133, + "learning_rate": 2.165263213691885e-06, + "loss": 1.3312, + "step": 4850 + }, + { + "epoch": 0.791627641812925, + "grad_norm": 7.60506010055542, + "learning_rate": 2.133041767510523e-06, + "loss": 1.285, + "step": 4860 + }, + { + "epoch": 0.7932565052734455, + "grad_norm": 8.978716850280762, + "learning_rate": 2.1010332212750926e-06, + "loss": 1.2393, + "step": 4870 + }, + { + "epoch": 0.7948853687339659, + "grad_norm": 8.022439956665039, + "learning_rate": 2.0692384412184587e-06, + "loss": 1.1718, + "step": 4880 + }, + { + "epoch": 0.7965142321944862, + "grad_norm": 11.286605834960938, + "learning_rate": 2.0376582877884322e-06, + "loss": 1.1298, + "step": 4890 + }, + { + "epoch": 0.7981430956550067, + "grad_norm": 8.459285736083984, + "learning_rate": 2.0062936156244695e-06, + "loss": 1.3101, + "step": 4900 + }, + { + "epoch": 0.7997719591155271, + "grad_norm": 8.629647254943848, + "learning_rate": 1.9751452735345677e-06, + "loss": 1.3471, + "step": 4910 + }, + { + "epoch": 0.8014008225760476, + "grad_norm": 9.93689250946045, + "learning_rate": 1.9442141044722694e-06, + "loss": 1.2816, + "step": 4920 + }, + { + "epoch": 0.803029686036568, + "grad_norm": 13.70149040222168, + "learning_rate": 1.9135009455138643e-06, + "loss": 1.1864, + "step": 4930 + }, + { + "epoch": 0.8046585494970884, + "grad_norm": 11.720361709594727, + "learning_rate": 1.8830066278357395e-06, + "loss": 1.2885, + "step": 4940 + }, + { + "epoch": 0.8062874129576089, + "grad_norm": 9.665403366088867, + "learning_rate": 1.8527319766918694e-06, + "loss": 1.2245, + "step": 4950 + }, + { + "epoch": 0.8079162764181292, + "grad_norm": 8.559013366699219, + "learning_rate": 1.8226778113914989e-06, + "loss": 1.1385, + "step": 4960 + }, + { + "epoch": 0.8095451398786496, + "grad_norm": 9.008288383483887, + "learning_rate": 1.7928449452769636e-06, + "loss": 1.1462, + "step": 4970 + }, + { + "epoch": 0.8111740033391701, + "grad_norm": 7.702124118804932, + "learning_rate": 1.7632341857016733e-06, + "loss": 1.2371, + "step": 4980 + }, + { + "epoch": 0.8128028667996905, + "grad_norm": 7.910833835601807, + "learning_rate": 1.7338463340082734e-06, + "loss": 1.2431, + "step": 4990 + }, + { + "epoch": 0.814431730260211, + "grad_norm": 10.374570846557617, + "learning_rate": 1.7046821855069562e-06, + "loss": 1.2451, + "step": 5000 + }, + { + "epoch": 0.814431730260211, + "eval_loss": 1.1212321519851685, + "eval_runtime": 102.729, + "eval_samples_per_second": 25.163, + "eval_steps_per_second": 25.163, + "step": 5000 + }, + { + "epoch": 0.8160605937207314, + "grad_norm": 12.469367027282715, + "learning_rate": 1.6757425294539266e-06, + "loss": 1.3257, + "step": 5010 + }, + { + "epoch": 0.8176894571812517, + "grad_norm": 8.322937965393066, + "learning_rate": 1.647028149030061e-06, + "loss": 1.2494, + "step": 5020 + }, + { + "epoch": 0.8193183206417722, + "grad_norm": 10.362074851989746, + "learning_rate": 1.6185398213196935e-06, + "loss": 1.1441, + "step": 5030 + }, + { + "epoch": 0.8209471841022926, + "grad_norm": 10.657358169555664, + "learning_rate": 1.5902783172896042e-06, + "loss": 1.2038, + "step": 5040 + }, + { + "epoch": 0.822576047562813, + "grad_norm": 6.7474517822265625, + "learning_rate": 1.5622444017681438e-06, + "loss": 1.1308, + "step": 5050 + }, + { + "epoch": 0.8242049110233335, + "grad_norm": 10.932724952697754, + 
"learning_rate": 1.534438833424533e-06, + "loss": 1.1732, + "step": 5060 + }, + { + "epoch": 0.8258337744838539, + "grad_norm": 7.627405643463135, + "learning_rate": 1.5068623647483428e-06, + "loss": 1.1552, + "step": 5070 + }, + { + "epoch": 0.8274626379443744, + "grad_norm": 13.693827629089355, + "learning_rate": 1.479515742029115e-06, + "loss": 1.1808, + "step": 5080 + }, + { + "epoch": 0.8290915014048947, + "grad_norm": 7.652097702026367, + "learning_rate": 1.4523997053361805e-06, + "loss": 1.2478, + "step": 5090 + }, + { + "epoch": 0.8307203648654151, + "grad_norm": 6.031581401824951, + "learning_rate": 1.4255149884986253e-06, + "loss": 1.1577, + "step": 5100 + }, + { + "epoch": 0.8323492283259356, + "grad_norm": 7.7093729972839355, + "learning_rate": 1.3988623190854233e-06, + "loss": 1.1844, + "step": 5110 + }, + { + "epoch": 0.833978091786456, + "grad_norm": 9.999074935913086, + "learning_rate": 1.3724424183857599e-06, + "loss": 1.1921, + "step": 5120 + }, + { + "epoch": 0.8356069552469764, + "grad_norm": 6.947312355041504, + "learning_rate": 1.3462560013895031e-06, + "loss": 1.273, + "step": 5130 + }, + { + "epoch": 0.8372358187074969, + "grad_norm": 9.171476364135742, + "learning_rate": 1.320303776767855e-06, + "loss": 1.1454, + "step": 5140 + }, + { + "epoch": 0.8388646821680172, + "grad_norm": 7.364622592926025, + "learning_rate": 1.2945864468541792e-06, + "loss": 1.2112, + "step": 5150 + }, + { + "epoch": 0.8404935456285377, + "grad_norm": 6.328592777252197, + "learning_rate": 1.2691047076249852e-06, + "loss": 1.2982, + "step": 5160 + }, + { + "epoch": 0.8421224090890581, + "grad_norm": 11.03521728515625, + "learning_rate": 1.2438592486811007e-06, + "loss": 1.2062, + "step": 5170 + }, + { + "epoch": 0.8437512725495785, + "grad_norm": 7.168650150299072, + "learning_rate": 1.2188507532290094e-06, + "loss": 1.2764, + "step": 5180 + }, + { + "epoch": 0.845380136010099, + "grad_norm": 7.8710784912109375, + "learning_rate": 1.194079898062349e-06, + "loss": 1.1388, + "step": 5190 + }, + { + "epoch": 0.8470089994706194, + "grad_norm": 9.251112937927246, + "learning_rate": 1.1695473535436187e-06, + "loss": 1.1258, + "step": 5200 + }, + { + "epoch": 0.8486378629311399, + "grad_norm": 10.411331176757812, + "learning_rate": 1.145253783586011e-06, + "loss": 1.2281, + "step": 5210 + }, + { + "epoch": 0.8502667263916602, + "grad_norm": 13.52040958404541, + "learning_rate": 1.1211998456354656e-06, + "loss": 1.2075, + "step": 5220 + }, + { + "epoch": 0.8518955898521806, + "grad_norm": 8.210290908813477, + "learning_rate": 1.0973861906528692e-06, + "loss": 1.3148, + "step": 5230 + }, + { + "epoch": 0.8535244533127011, + "grad_norm": 7.864938259124756, + "learning_rate": 1.0738134630964326e-06, + "loss": 1.3147, + "step": 5240 + }, + { + "epoch": 0.8551533167732215, + "grad_norm": 8.113484382629395, + "learning_rate": 1.050482300904264e-06, + "loss": 1.2167, + "step": 5250 + }, + { + "epoch": 0.8567821802337419, + "grad_norm": 10.562095642089844, + "learning_rate": 1.0273933354770894e-06, + "loss": 1.3086, + "step": 5260 + }, + { + "epoch": 0.8584110436942624, + "grad_norm": 10.281449317932129, + "learning_rate": 1.004547191661178e-06, + "loss": 1.21, + "step": 5270 + }, + { + "epoch": 0.8600399071547827, + "grad_norm": 9.31655216217041, + "learning_rate": 9.819444877314299e-07, + "loss": 1.3514, + "step": 5280 + }, + { + "epoch": 0.8616687706153032, + "grad_norm": 8.388626098632812, + "learning_rate": 9.5958583537463e-07, + "loss": 1.2307, + "step": 5290 + }, + { + "epoch": 0.8632976340758236, + 
"grad_norm": 8.135517120361328, + "learning_rate": 9.374718396729188e-07, + "loss": 1.2003, + "step": 5300 + }, + { + "epoch": 0.864926497536344, + "grad_norm": 7.857133388519287, + "learning_rate": 9.156030990873932e-07, + "loss": 1.1086, + "step": 5310 + }, + { + "epoch": 0.8665553609968645, + "grad_norm": 10.863335609436035, + "learning_rate": 8.939802054419289e-07, + "loss": 1.2312, + "step": 5320 + }, + { + "epoch": 0.8681842244573849, + "grad_norm": 8.662337303161621, + "learning_rate": 8.726037439071555e-07, + "loss": 1.2288, + "step": 5330 + }, + { + "epoch": 0.8698130879179052, + "grad_norm": 8.542695045471191, + "learning_rate": 8.514742929846142e-07, + "loss": 1.1772, + "step": 5340 + }, + { + "epoch": 0.8714419513784257, + "grad_norm": 8.269294738769531, + "learning_rate": 8.305924244911178e-07, + "loss": 1.1143, + "step": 5350 + }, + { + "epoch": 0.8730708148389461, + "grad_norm": 7.3109354972839355, + "learning_rate": 8.099587035432654e-07, + "loss": 1.128, + "step": 5360 + }, + { + "epoch": 0.8746996782994666, + "grad_norm": 7.124019622802734, + "learning_rate": 7.895736885421468e-07, + "loss": 1.3185, + "step": 5370 + }, + { + "epoch": 0.876328541759987, + "grad_norm": 10.167950630187988, + "learning_rate": 7.694379311582401e-07, + "loss": 1.2222, + "step": 5380 + }, + { + "epoch": 0.8779574052205074, + "grad_norm": 9.3758544921875, + "learning_rate": 7.49551976316475e-07, + "loss": 1.23, + "step": 5390 + }, + { + "epoch": 0.8795862686810278, + "grad_norm": 10.248433113098145, + "learning_rate": 7.299163621814853e-07, + "loss": 1.2133, + "step": 5400 + }, + { + "epoch": 0.8812151321415482, + "grad_norm": 13.722402572631836, + "learning_rate": 7.105316201430512e-07, + "loss": 1.2697, + "step": 5410 + }, + { + "epoch": 0.8828439956020686, + "grad_norm": 8.881364822387695, + "learning_rate": 6.91398274801709e-07, + "loss": 1.18, + "step": 5420 + }, + { + "epoch": 0.8844728590625891, + "grad_norm": 6.859842300415039, + "learning_rate": 6.725168439545637e-07, + "loss": 1.1595, + "step": 5430 + }, + { + "epoch": 0.8861017225231095, + "grad_norm": 8.630757331848145, + "learning_rate": 6.53887838581273e-07, + "loss": 1.2345, + "step": 5440 + }, + { + "epoch": 0.88773058598363, + "grad_norm": 11.166703224182129, + "learning_rate": 6.355117628302121e-07, + "loss": 1.2953, + "step": 5450 + }, + { + "epoch": 0.8893594494441504, + "grad_norm": 7.578332901000977, + "learning_rate": 6.173891140048427e-07, + "loss": 1.1959, + "step": 5460 + }, + { + "epoch": 0.8909883129046707, + "grad_norm": 18.070478439331055, + "learning_rate": 5.995203825502393e-07, + "loss": 1.2072, + "step": 5470 + }, + { + "epoch": 0.8926171763651912, + "grad_norm": 9.904817581176758, + "learning_rate": 5.819060520398345e-07, + "loss": 1.2266, + "step": 5480 + }, + { + "epoch": 0.8942460398257116, + "grad_norm": 8.371089935302734, + "learning_rate": 5.645465991623167e-07, + "loss": 1.2003, + "step": 5490 + }, + { + "epoch": 0.895874903286232, + "grad_norm": 9.28614616394043, + "learning_rate": 5.474424937087353e-07, + "loss": 1.2134, + "step": 5500 + }, + { + "epoch": 0.895874903286232, + "eval_loss": 1.1204986572265625, + "eval_runtime": 102.9981, + "eval_samples_per_second": 25.098, + "eval_steps_per_second": 25.098, + "step": 5500 + }, + { + "epoch": 0.8975037667467525, + "grad_norm": 8.397842407226562, + "learning_rate": 5.305941985597929e-07, + "loss": 1.1799, + "step": 5510 + }, + { + "epoch": 0.8991326302072729, + "grad_norm": 5.709924697875977, + "learning_rate": 5.140021696733066e-07, + "loss": 1.233, + 
"step": 5520 + }, + { + "epoch": 0.9007614936677933, + "grad_norm": 8.849600791931152, + "learning_rate": 4.97666856071879e-07, + "loss": 1.3359, + "step": 5530 + }, + { + "epoch": 0.9023903571283137, + "grad_norm": 8.294559478759766, + "learning_rate": 4.815886998307439e-07, + "loss": 1.2746, + "step": 5540 + }, + { + "epoch": 0.9040192205888341, + "grad_norm": 8.917488098144531, + "learning_rate": 4.657681360657962e-07, + "loss": 1.1755, + "step": 5550 + }, + { + "epoch": 0.9056480840493546, + "grad_norm": 6.684938907623291, + "learning_rate": 4.502055929218241e-07, + "loss": 1.2818, + "step": 5560 + }, + { + "epoch": 0.907276947509875, + "grad_norm": 6.304566383361816, + "learning_rate": 4.34901491560924e-07, + "loss": 1.3162, + "step": 5570 + }, + { + "epoch": 0.9089058109703954, + "grad_norm": 16.449485778808594, + "learning_rate": 4.1985624615109134e-07, + "loss": 1.0536, + "step": 5580 + }, + { + "epoch": 0.9105346744309158, + "grad_norm": 9.711615562438965, + "learning_rate": 4.0507026385502747e-07, + "loss": 1.1989, + "step": 5590 + }, + { + "epoch": 0.9121635378914362, + "grad_norm": 9.28388786315918, + "learning_rate": 3.9054394481910507e-07, + "loss": 1.3322, + "step": 5600 + }, + { + "epoch": 0.9137924013519567, + "grad_norm": 7.323754787445068, + "learning_rate": 3.7627768216255244e-07, + "loss": 1.1938, + "step": 5610 + }, + { + "epoch": 0.9154212648124771, + "grad_norm": 7.1576385498046875, + "learning_rate": 3.6227186196680976e-07, + "loss": 1.2738, + "step": 5620 + }, + { + "epoch": 0.9170501282729975, + "grad_norm": 8.460196495056152, + "learning_rate": 3.485268632650751e-07, + "loss": 1.2285, + "step": 5630 + }, + { + "epoch": 0.918678991733518, + "grad_norm": 8.773188591003418, + "learning_rate": 3.350430580320574e-07, + "loss": 1.2892, + "step": 5640 + }, + { + "epoch": 0.9203078551940383, + "grad_norm": 7.650882720947266, + "learning_rate": 3.218208111738996e-07, + "loss": 1.2367, + "step": 5650 + }, + { + "epoch": 0.9219367186545587, + "grad_norm": 12.023070335388184, + "learning_rate": 3.088604805183126e-07, + "loss": 1.2383, + "step": 5660 + }, + { + "epoch": 0.9235655821150792, + "grad_norm": 10.934995651245117, + "learning_rate": 2.9616241680488713e-07, + "loss": 1.2227, + "step": 5670 + }, + { + "epoch": 0.9251944455755996, + "grad_norm": 8.503402709960938, + "learning_rate": 2.837269636755946e-07, + "loss": 1.3065, + "step": 5680 + }, + { + "epoch": 0.9268233090361201, + "grad_norm": 8.271093368530273, + "learning_rate": 2.7155445766550605e-07, + "loss": 1.0851, + "step": 5690 + }, + { + "epoch": 0.9284521724966405, + "grad_norm": 8.876206398010254, + "learning_rate": 2.5964522819366125e-07, + "loss": 1.2316, + "step": 5700 + }, + { + "epoch": 0.9300810359571609, + "grad_norm": 11.97021484375, + "learning_rate": 2.479995975541749e-07, + "loss": 1.2201, + "step": 5710 + }, + { + "epoch": 0.9317098994176813, + "grad_norm": 9.127492904663086, + "learning_rate": 2.3661788090750038e-07, + "loss": 1.2073, + "step": 5720 + }, + { + "epoch": 0.9333387628782017, + "grad_norm": 7.851465702056885, + "learning_rate": 2.255003862719074e-07, + "loss": 1.2395, + "step": 5730 + }, + { + "epoch": 0.9349676263387221, + "grad_norm": 11.95673942565918, + "learning_rate": 2.1464741451514447e-07, + "loss": 1.2248, + "step": 5740 + }, + { + "epoch": 0.9365964897992426, + "grad_norm": 7.73248291015625, + "learning_rate": 2.0405925934629423e-07, + "loss": 1.3249, + "step": 5750 + }, + { + "epoch": 0.938225353259763, + "grad_norm": 10.03739070892334, + "learning_rate": 
1.9373620730783082e-07, + "loss": 1.1815, + "step": 5760 + }, + { + "epoch": 0.9398542167202835, + "grad_norm": 8.619077682495117, + "learning_rate": 1.836785377678596e-07, + "loss": 1.244, + "step": 5770 + }, + { + "epoch": 0.9414830801808038, + "grad_norm": 8.094244956970215, + "learning_rate": 1.738865229125597e-07, + "loss": 1.2881, + "step": 5780 + }, + { + "epoch": 0.9431119436413242, + "grad_norm": 9.411540031433105, + "learning_rate": 1.6436042773881666e-07, + "loss": 1.2496, + "step": 5790 + }, + { + "epoch": 0.9447408071018447, + "grad_norm": 9.164313316345215, + "learning_rate": 1.5510051004705263e-07, + "loss": 1.2901, + "step": 5800 + }, + { + "epoch": 0.9463696705623651, + "grad_norm": 10.367000579833984, + "learning_rate": 1.4610702043424628e-07, + "loss": 1.1452, + "step": 5810 + }, + { + "epoch": 0.9479985340228856, + "grad_norm": 8.201516151428223, + "learning_rate": 1.373802022871551e-07, + "loss": 1.2266, + "step": 5820 + }, + { + "epoch": 0.949627397483406, + "grad_norm": 8.908825874328613, + "learning_rate": 1.2892029177572817e-07, + "loss": 1.2799, + "step": 5830 + }, + { + "epoch": 0.9512562609439263, + "grad_norm": 7.841489315032959, + "learning_rate": 1.2072751784671043e-07, + "loss": 1.2983, + "step": 5840 + }, + { + "epoch": 0.9528851244044468, + "grad_norm": 9.294130325317383, + "learning_rate": 1.1280210221745192e-07, + "loss": 1.2792, + "step": 5850 + }, + { + "epoch": 0.9545139878649672, + "grad_norm": 7.930861473083496, + "learning_rate": 1.0514425936990369e-07, + "loss": 1.1413, + "step": 5860 + }, + { + "epoch": 0.9561428513254876, + "grad_norm": 9.744686126708984, + "learning_rate": 9.775419654481588e-08, + "loss": 1.1266, + "step": 5870 + }, + { + "epoch": 0.9577717147860081, + "grad_norm": 7.7475905418396, + "learning_rate": 9.063211373613102e-08, + "loss": 1.4206, + "step": 5880 + }, + { + "epoch": 0.9594005782465285, + "grad_norm": 5.828322887420654, + "learning_rate": 8.3778203685565e-08, + "loss": 1.2971, + "step": 5890 + }, + { + "epoch": 0.961029441707049, + "grad_norm": 10.001614570617676, + "learning_rate": 7.71926518773991e-08, + "loss": 1.2998, + "step": 5900 + }, + { + "epoch": 0.9626583051675693, + "grad_norm": 9.513165473937988, + "learning_rate": 7.087563653345286e-08, + "loss": 1.1007, + "step": 5910 + }, + { + "epoch": 0.9642871686280897, + "grad_norm": 9.425087928771973, + "learning_rate": 6.482732860826679e-08, + "loss": 1.245, + "step": 5920 + }, + { + "epoch": 0.9659160320886102, + "grad_norm": 8.666535377502441, + "learning_rate": 5.90478917844739e-08, + "loss": 1.2323, + "step": 5930 + }, + { + "epoch": 0.9675448955491306, + "grad_norm": 8.544173240661621, + "learning_rate": 5.3537482468366544e-08, + "loss": 1.2152, + "step": 5940 + }, + { + "epoch": 0.969173759009651, + "grad_norm": 9.873296737670898, + "learning_rate": 4.829624978567204e-08, + "loss": 1.216, + "step": 5950 + }, + { + "epoch": 0.9708026224701715, + "grad_norm": 6.528816223144531, + "learning_rate": 4.332433557750704e-08, + "loss": 1.36, + "step": 5960 + }, + { + "epoch": 0.9724314859306918, + "grad_norm": 13.494250297546387, + "learning_rate": 3.862187439654608e-08, + "loss": 1.1844, + "step": 5970 + }, + { + "epoch": 0.9740603493912123, + "grad_norm": 10.320327758789062, + "learning_rate": 3.41889935033779e-08, + "loss": 1.3135, + "step": 5980 + }, + { + "epoch": 0.9756892128517327, + "grad_norm": 8.843477249145508, + "learning_rate": 3.002581286305817e-08, + "loss": 1.2222, + "step": 5990 + }, + { + "epoch": 0.9773180763122531, + "grad_norm": 7.343149662017822, 
+ "learning_rate": 2.6132445141872074e-08, + "loss": 1.2437, + "step": 6000 + }, + { + "epoch": 0.9773180763122531, + "eval_loss": 1.1204884052276611, + "eval_runtime": 102.4877, + "eval_samples_per_second": 25.223, + "eval_steps_per_second": 25.223, + "step": 6000 + }, + { + "epoch": 0.9789469397727736, + "grad_norm": 10.39957332611084, + "learning_rate": 2.250899570427345e-08, + "loss": 1.174, + "step": 6010 + }, + { + "epoch": 0.980575803233294, + "grad_norm": 7.477624416351318, + "learning_rate": 1.91555626100437e-08, + "loss": 1.3035, + "step": 6020 + }, + { + "epoch": 0.9822046666938143, + "grad_norm": 7.673905849456787, + "learning_rate": 1.6072236611629487e-08, + "loss": 1.2888, + "step": 6030 + }, + { + "epoch": 0.9838335301543348, + "grad_norm": 8.837902069091797, + "learning_rate": 1.325910115169471e-08, + "loss": 1.3579, + "step": 6040 + }, + { + "epoch": 0.9854623936148552, + "grad_norm": 9.265447616577148, + "learning_rate": 1.0716232360856726e-08, + "loss": 1.2222, + "step": 6050 + }, + { + "epoch": 0.9870912570753757, + "grad_norm": 11.45450210571289, + "learning_rate": 8.44369905562692e-09, + "loss": 1.2786, + "step": 6060 + }, + { + "epoch": 0.9887201205358961, + "grad_norm": 9.006537437438965, + "learning_rate": 6.441562736551054e-09, + "loss": 1.2923, + "step": 6070 + }, + { + "epoch": 0.9903489839964165, + "grad_norm": 7.819212913513184, + "learning_rate": 4.709877586540623e-09, + "loss": 1.1388, + "step": 6080 + }, + { + "epoch": 0.991977847456937, + "grad_norm": 10.596216201782227, + "learning_rate": 3.2486904694128963e-09, + "loss": 1.1399, + "step": 6090 + }, + { + "epoch": 0.9936067109174573, + "grad_norm": 9.980060577392578, + "learning_rate": 2.0580409286152792e-09, + "loss": 1.1898, + "step": 6100 + }, + { + "epoch": 0.9952355743779777, + "grad_norm": 7.423088073730469, + "learning_rate": 1.1379611861594974e-09, + "loss": 1.234, + "step": 6110 + }, + { + "epoch": 0.9968644378384982, + "grad_norm": 8.370770454406738, + "learning_rate": 4.884761417489614e-10, + "loss": 1.2935, + "step": 6120 + }, + { + "epoch": 0.9984933012990186, + "grad_norm": 8.747970581054688, + "learning_rate": 1.0960337210597083e-10, + "loss": 1.2299, + "step": 6130 + } + ], + "logging_steps": 10, + "max_steps": 6139, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1514993505940275e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}