diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,3095 +3,4278 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, - "global_step": 8786, + "global_step": 12178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0022765430693491933, - "grad_norm": 0.469247430562973, + "epoch": 0.0016424069473813874, + "grad_norm": 0.5388180017471313, "learning_rate": 0.0002, - "loss": 1.9469, + "loss": 1.8932, "step": 20 }, { - "epoch": 0.004553086138698387, - "grad_norm": 0.6239348649978638, + "epoch": 0.003284813894762775, + "grad_norm": 0.46543794870376587, "learning_rate": 0.0002, - "loss": 1.556, + "loss": 1.6701, "step": 40 }, { - "epoch": 0.006829629208047579, - "grad_norm": 0.4587397277355194, + "epoch": 0.004927220842144162, + "grad_norm": 0.45620647072792053, "learning_rate": 0.0002, - "loss": 1.4108, + "loss": 1.5541, "step": 60 }, { - "epoch": 0.009106172277396773, - "grad_norm": 0.42919760942459106, + "epoch": 0.00656962778952555, + "grad_norm": 0.4583057761192322, "learning_rate": 0.0002, - "loss": 1.3352, + "loss": 1.5777, "step": 80 }, { - "epoch": 0.011382715346745967, - "grad_norm": 0.46492573618888855, + "epoch": 0.008212034736906937, + "grad_norm": 0.5295430421829224, "learning_rate": 0.0002, - "loss": 1.3388, + "loss": 1.3046, "step": 100 }, { - "epoch": 0.013659258416095159, - "grad_norm": 0.453070729970932, + "epoch": 0.009854441684288324, + "grad_norm": 0.44552722573280334, "learning_rate": 0.0002, - "loss": 1.2295, + "loss": 1.3053, "step": 120 }, { - "epoch": 0.015935801485444354, - "grad_norm": 0.4760678708553314, + "epoch": 0.011496848631669712, + "grad_norm": 0.45540332794189453, "learning_rate": 0.0002, - "loss": 1.2493, + "loss": 1.1971, "step": 140 }, { - "epoch": 0.018212344554793546, - "grad_norm": 0.4545675814151764, + "epoch": 0.0131392555790511, + "grad_norm": 0.4302205443382263, "learning_rate": 0.0002, - "loss": 1.215, + "loss": 1.2143, "step": 160 }, { - "epoch": 0.020488887624142738, - "grad_norm": 0.4772235155105591, + "epoch": 0.014781662526432487, + "grad_norm": 0.4064156413078308, "learning_rate": 0.0002, - "loss": 1.2173, + "loss": 1.1695, "step": 180 }, { - "epoch": 0.022765430693491934, - "grad_norm": 0.4403541088104248, + "epoch": 0.016424069473813873, + "grad_norm": 0.43175607919692993, "learning_rate": 0.0002, - "loss": 1.1058, + "loss": 1.1836, "step": 200 }, { - "epoch": 0.025041973762841126, - "grad_norm": 0.511401355266571, + "epoch": 0.01806647642119526, + "grad_norm": 0.5280532240867615, "learning_rate": 0.0002, - "loss": 1.1049, + "loss": 1.1627, "step": 220 }, { - "epoch": 0.027318516832190318, - "grad_norm": 0.3809013366699219, + "epoch": 0.01970888336857665, + "grad_norm": 0.4442996382713318, "learning_rate": 0.0002, - "loss": 1.0498, + "loss": 1.2294, "step": 240 }, { - "epoch": 0.029595059901539513, - "grad_norm": 0.3980010449886322, + "epoch": 0.021351290315958036, + "grad_norm": 0.4584205448627472, "learning_rate": 0.0002, - "loss": 0.9842, + "loss": 1.058, "step": 260 }, { - "epoch": 0.03187160297088871, - "grad_norm": 0.5747793316841125, + "epoch": 0.022993697263339424, + "grad_norm": 0.40979012846946716, "learning_rate": 0.0002, - "loss": 1.0988, + "loss": 1.0436, "step": 280 }, { - "epoch": 0.0341481460402379, - "grad_norm": 0.46827971935272217, + "epoch": 0.02463610421072081, + "grad_norm": 0.4241325557231903, "learning_rate": 0.0002, - "loss": 1.0367, + "loss": 1.1414, "step": 300 }, { - "epoch": 0.03642468910958709, - "grad_norm": 0.4702209532260895, + "epoch": 0.0262785111581022, + "grad_norm": 0.4106293022632599, "learning_rate": 0.0002, - "loss": 1.066, + "loss": 1.0744, "step": 320 }, { - "epoch": 0.038701232178936285, - "grad_norm": 0.5084996223449707, + "epoch": 0.027920918105483587, + "grad_norm": 0.46253764629364014, "learning_rate": 0.0002, - "loss": 1.0652, + "loss": 1.0589, "step": 340 }, { - "epoch": 0.040977775248285477, - "grad_norm": 0.3944012522697449, + "epoch": 0.029563325052864974, + "grad_norm": 0.4244967997074127, "learning_rate": 0.0002, - "loss": 0.9642, + "loss": 1.0263, "step": 360 }, { - "epoch": 0.04325431831763467, - "grad_norm": 0.40287718176841736, + "epoch": 0.031205732000246362, + "grad_norm": 0.35677096247673035, "learning_rate": 0.0002, - "loss": 0.9431, + "loss": 1.0447, "step": 380 }, { - "epoch": 0.04553086138698387, - "grad_norm": 0.4629077613353729, + "epoch": 0.032848138947627746, + "grad_norm": 0.4948490262031555, "learning_rate": 0.0002, - "loss": 0.9615, + "loss": 1.0826, "step": 400 }, { - "epoch": 0.04780740445633306, - "grad_norm": 0.44827452301979065, + "epoch": 0.034490545895009134, + "grad_norm": 0.5756106972694397, "learning_rate": 0.0002, - "loss": 0.9434, + "loss": 0.948, "step": 420 }, { - "epoch": 0.05008394752568225, - "grad_norm": 0.41644710302352905, + "epoch": 0.03613295284239052, + "grad_norm": 0.5383228063583374, "learning_rate": 0.0002, - "loss": 0.9241, + "loss": 1.0025, "step": 440 }, { - "epoch": 0.05236049059503144, - "grad_norm": 0.4760611057281494, + "epoch": 0.03777535978977191, + "grad_norm": 0.3955784738063812, "learning_rate": 0.0002, - "loss": 0.8475, + "loss": 0.9027, "step": 460 }, { - "epoch": 0.054637033664380635, - "grad_norm": 0.45987364649772644, + "epoch": 0.0394177667371533, + "grad_norm": 0.37915533781051636, "learning_rate": 0.0002, - "loss": 0.898, + "loss": 0.9936, "step": 480 }, { - "epoch": 0.056913576733729834, - "grad_norm": 0.4840068817138672, + "epoch": 0.041060173684534684, + "grad_norm": 0.5413188934326172, "learning_rate": 0.0002, - "loss": 0.9611, + "loss": 0.9077, "step": 500 }, { - "epoch": 0.059190119803079026, - "grad_norm": 0.40314286947250366, + "epoch": 0.04270258063191607, + "grad_norm": 0.5334627032279968, "learning_rate": 0.0002, - "loss": 0.8884, + "loss": 0.9009, "step": 520 }, { - "epoch": 0.06146666287242822, - "grad_norm": 0.5458106398582458, + "epoch": 0.04434498757929746, + "grad_norm": 0.5394805073738098, "learning_rate": 0.0002, - "loss": 0.8939, + "loss": 0.9542, "step": 540 }, { - "epoch": 0.06374320594177742, - "grad_norm": 0.5420896410942078, + "epoch": 0.04598739452667885, + "grad_norm": 0.532177746295929, "learning_rate": 0.0002, - "loss": 0.8265, + "loss": 0.8743, "step": 560 }, { - "epoch": 0.0660197490111266, - "grad_norm": 0.5356529355049133, + "epoch": 0.047629801474060235, + "grad_norm": 0.5266315937042236, "learning_rate": 0.0002, - "loss": 0.8432, + "loss": 0.8931, "step": 580 }, { - "epoch": 0.0682962920804758, - "grad_norm": 0.5064826011657715, + "epoch": 0.04927220842144162, + "grad_norm": 0.4725072979927063, "learning_rate": 0.0002, - "loss": 0.8272, + "loss": 0.908, "step": 600 }, { - "epoch": 0.07057283514982499, - "grad_norm": 0.4143005311489105, + "epoch": 0.05091461536882301, + "grad_norm": 0.6026243567466736, "learning_rate": 0.0002, - "loss": 0.7854, + "loss": 0.7898, "step": 620 }, { - "epoch": 0.07284937821917419, - "grad_norm": 0.3817225396633148, + "epoch": 0.0525570223162044, + "grad_norm": 0.4928111732006073, "learning_rate": 0.0002, - "loss": 0.8219, + "loss": 0.8406, "step": 640 }, { - "epoch": 0.07512592128852338, - "grad_norm": 0.5336936712265015, + "epoch": 0.054199429263585785, + "grad_norm": 0.4555020332336426, "learning_rate": 0.0002, - "loss": 0.7977, + "loss": 0.8222, "step": 660 }, { - "epoch": 0.07740246435787257, - "grad_norm": 0.5397001504898071, + "epoch": 0.05584183621096717, + "grad_norm": 0.6445655822753906, "learning_rate": 0.0002, - "loss": 0.8117, + "loss": 0.832, "step": 680 }, { - "epoch": 0.07967900742722177, - "grad_norm": 0.4968530535697937, + "epoch": 0.05748424315834856, + "grad_norm": 0.5854527950286865, "learning_rate": 0.0002, - "loss": 0.7527, + "loss": 0.8435, "step": 700 }, { - "epoch": 0.08195555049657095, - "grad_norm": 0.4084935784339905, + "epoch": 0.05912665010572995, + "grad_norm": 0.4609089195728302, "learning_rate": 0.0002, - "loss": 0.651, + "loss": 0.748, "step": 720 }, { - "epoch": 0.08423209356592015, - "grad_norm": 0.48406732082366943, + "epoch": 0.060769057053111336, + "grad_norm": 0.5567362904548645, "learning_rate": 0.0002, - "loss": 0.7352, + "loss": 0.7777, "step": 740 }, { - "epoch": 0.08650863663526934, - "grad_norm": 0.5246301293373108, + "epoch": 0.062411464000492724, + "grad_norm": 0.5161166191101074, "learning_rate": 0.0002, - "loss": 0.7785, + "loss": 0.7597, "step": 760 }, { - "epoch": 0.08878517970461854, - "grad_norm": 0.5729619264602661, + "epoch": 0.06405387094787411, + "grad_norm": 0.5450626611709595, "learning_rate": 0.0002, - "loss": 0.7646, + "loss": 0.7337, "step": 780 }, { - "epoch": 0.09106172277396773, - "grad_norm": 0.5675190687179565, + "epoch": 0.06569627789525549, + "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, - "loss": 0.7784, + "loss": 0.7668, "step": 800 }, { - "epoch": 0.09333826584331692, - "grad_norm": 0.4682878255844116, + "epoch": 0.06733868484263689, + "grad_norm": 0.4653383493423462, "learning_rate": 0.0002, - "loss": 0.7284, + "loss": 0.7417, "step": 820 }, { - "epoch": 0.09561480891266612, - "grad_norm": 0.5388545393943787, + "epoch": 0.06898109179001827, + "grad_norm": 0.4846251308917999, "learning_rate": 0.0002, - "loss": 0.6959, + "loss": 0.7506, "step": 840 }, { - "epoch": 0.0978913519820153, - "grad_norm": 0.48806509375572205, + "epoch": 0.07062349873739966, + "grad_norm": 0.4887784719467163, "learning_rate": 0.0002, - "loss": 0.7585, + "loss": 0.7115, "step": 860 }, { - "epoch": 0.1001678950513645, - "grad_norm": 0.4149261713027954, + "epoch": 0.07226590568478104, + "grad_norm": 0.5024611949920654, "learning_rate": 0.0002, - "loss": 0.6978, + "loss": 0.7402, "step": 880 }, { - "epoch": 0.1024444381207137, - "grad_norm": 0.4971105754375458, + "epoch": 0.07390831263216244, + "grad_norm": 0.5007764101028442, "learning_rate": 0.0002, - "loss": 0.7103, + "loss": 0.6529, "step": 900 }, { - "epoch": 0.10472098119006289, - "grad_norm": 0.5066735744476318, + "epoch": 0.07555071957954382, + "grad_norm": 0.5097551345825195, "learning_rate": 0.0002, - "loss": 0.6854, + "loss": 0.7776, "step": 920 }, { - "epoch": 0.10699752425941209, - "grad_norm": 0.4922661781311035, + "epoch": 0.07719312652692521, + "grad_norm": 0.5517822504043579, "learning_rate": 0.0002, - "loss": 0.6231, + "loss": 0.6609, "step": 940 }, { - "epoch": 0.10927406732876127, - "grad_norm": 0.5949555039405823, + "epoch": 0.0788355334743066, + "grad_norm": 0.5290623307228088, "learning_rate": 0.0002, - "loss": 0.6813, + "loss": 0.7015, "step": 960 }, { - "epoch": 0.11155061039811047, - "grad_norm": 0.581446647644043, + "epoch": 0.08047794042168799, + "grad_norm": 0.576545000076294, "learning_rate": 0.0002, - "loss": 0.6174, + "loss": 0.6752, "step": 980 }, { - "epoch": 0.11382715346745967, - "grad_norm": 0.6152529716491699, + "epoch": 0.08212034736906937, + "grad_norm": 0.4689784049987793, "learning_rate": 0.0002, - "loss": 0.6405, + "loss": 0.7047, "step": 1000 }, { - "epoch": 0.11610369653680885, - "grad_norm": 0.5986836552619934, + "epoch": 0.08376275431645076, + "grad_norm": 0.455814003944397, "learning_rate": 0.0002, - "loss": 0.5776, + "loss": 0.6378, "step": 1020 }, { - "epoch": 0.11838023960615805, - "grad_norm": 0.4255094528198242, + "epoch": 0.08540516126383214, + "grad_norm": 0.6452861428260803, "learning_rate": 0.0002, - "loss": 0.6576, + "loss": 0.6962, "step": 1040 }, { - "epoch": 0.12065678267550724, - "grad_norm": 0.4563849866390228, + "epoch": 0.08704756821121354, + "grad_norm": 0.5699702501296997, "learning_rate": 0.0002, - "loss": 0.6647, + "loss": 0.6508, "step": 1060 }, { - "epoch": 0.12293332574485644, - "grad_norm": 0.593227744102478, + "epoch": 0.08868997515859492, + "grad_norm": 0.5086561441421509, "learning_rate": 0.0002, - "loss": 0.6043, + "loss": 0.6174, "step": 1080 }, { - "epoch": 0.12520986881420562, - "grad_norm": 0.47059598565101624, + "epoch": 0.09033238210597631, + "grad_norm": 0.48543211817741394, "learning_rate": 0.0002, - "loss": 0.591, + "loss": 0.6261, "step": 1100 }, { - "epoch": 0.12748641188355483, - "grad_norm": 0.5013225674629211, + "epoch": 0.0919747890533577, + "grad_norm": 0.6361482739448547, "learning_rate": 0.0002, - "loss": 0.5947, + "loss": 0.6336, "step": 1120 }, { - "epoch": 0.12976295495290402, - "grad_norm": 0.46772757172584534, + "epoch": 0.09361719600073909, + "grad_norm": 0.5558167695999146, "learning_rate": 0.0002, - "loss": 0.6292, + "loss": 0.6678, "step": 1140 }, { - "epoch": 0.1320394980222532, - "grad_norm": 0.5844313502311707, + "epoch": 0.09525960294812047, + "grad_norm": 0.5599238872528076, "learning_rate": 0.0002, - "loss": 0.6128, + "loss": 0.6169, "step": 1160 }, { - "epoch": 0.1343160410916024, - "grad_norm": 0.5295489430427551, + "epoch": 0.09690200989550186, + "grad_norm": 0.5939186215400696, "learning_rate": 0.0002, - "loss": 0.6064, + "loss": 0.6059, "step": 1180 }, { - "epoch": 0.1365925841609516, - "grad_norm": 0.4482004642486572, + "epoch": 0.09854441684288325, + "grad_norm": 0.5663330554962158, "learning_rate": 0.0002, - "loss": 0.5899, + "loss": 0.5737, "step": 1200 }, { - "epoch": 0.1388691272303008, - "grad_norm": 0.6281692981719971, + "epoch": 0.10018682379026464, + "grad_norm": 0.49742865562438965, "learning_rate": 0.0002, - "loss": 0.6109, + "loss": 0.6013, "step": 1220 }, { - "epoch": 0.14114567029964997, - "grad_norm": 0.4718242585659027, + "epoch": 0.10182923073764602, + "grad_norm": 0.520782470703125, "learning_rate": 0.0002, - "loss": 0.5857, + "loss": 0.5929, "step": 1240 }, { - "epoch": 0.14342221336899919, - "grad_norm": 0.5219341516494751, + "epoch": 0.1034716376850274, + "grad_norm": 0.45269444584846497, "learning_rate": 0.0002, - "loss": 0.5581, + "loss": 0.5981, "step": 1260 }, { - "epoch": 0.14569875643834837, - "grad_norm": 0.47050580382347107, + "epoch": 0.1051140446324088, + "grad_norm": 0.5428550243377686, "learning_rate": 0.0002, - "loss": 0.6368, + "loss": 0.5814, "step": 1280 }, { - "epoch": 0.14797529950769756, - "grad_norm": 0.5425338745117188, + "epoch": 0.10675645157979018, + "grad_norm": 0.4782160818576813, "learning_rate": 0.0002, - "loss": 0.5626, + "loss": 0.5858, "step": 1300 }, { - "epoch": 0.15025184257704677, - "grad_norm": 0.4944934844970703, + "epoch": 0.10839885852717157, + "grad_norm": 0.5338163375854492, "learning_rate": 0.0002, - "loss": 0.5337, + "loss": 0.6255, "step": 1320 }, { - "epoch": 0.15252838564639595, - "grad_norm": 0.5921599864959717, + "epoch": 0.11004126547455295, + "grad_norm": 0.4596363306045532, "learning_rate": 0.0002, - "loss": 0.5672, + "loss": 0.5974, "step": 1340 }, { - "epoch": 0.15480492871574514, - "grad_norm": 0.4866751730442047, + "epoch": 0.11168367242193435, + "grad_norm": 0.5203448534011841, "learning_rate": 0.0002, - "loss": 0.5305, + "loss": 0.5452, "step": 1360 }, { - "epoch": 0.15708147178509432, - "grad_norm": 0.62166827917099, + "epoch": 0.11332607936931573, + "grad_norm": 0.44463276863098145, "learning_rate": 0.0002, - "loss": 0.5737, + "loss": 0.576, "step": 1380 }, { - "epoch": 0.15935801485444354, - "grad_norm": 0.5006982684135437, + "epoch": 0.11496848631669712, + "grad_norm": 0.5106232762336731, "learning_rate": 0.0002, - "loss": 0.5542, + "loss": 0.5679, "step": 1400 }, { - "epoch": 0.16163455792379272, - "grad_norm": 0.6090095043182373, + "epoch": 0.1166108932640785, + "grad_norm": 0.5451502799987793, "learning_rate": 0.0002, - "loss": 0.5215, + "loss": 0.5673, "step": 1420 }, { - "epoch": 0.1639111009931419, - "grad_norm": 0.4260309636592865, + "epoch": 0.1182533002114599, + "grad_norm": 0.6638749837875366, "learning_rate": 0.0002, - "loss": 0.5535, + "loss": 0.543, "step": 1440 }, { - "epoch": 0.16618764406249112, - "grad_norm": 0.48657718300819397, + "epoch": 0.11989570715884128, + "grad_norm": 0.5045977830886841, "learning_rate": 0.0002, - "loss": 0.5441, + "loss": 0.5803, "step": 1460 }, { - "epoch": 0.1684641871318403, - "grad_norm": 0.43275007605552673, + "epoch": 0.12153811410622267, + "grad_norm": 0.5385071635246277, "learning_rate": 0.0002, - "loss": 0.5161, + "loss": 0.5357, "step": 1480 }, { - "epoch": 0.1707407302011895, - "grad_norm": 0.4225006699562073, + "epoch": 0.12318052105360405, + "grad_norm": 0.43107932806015015, "learning_rate": 0.0002, - "loss": 0.512, + "loss": 0.5378, "step": 1500 }, { - "epoch": 0.17301727327053867, - "grad_norm": 0.5176346302032471, + "epoch": 0.12482292800098545, + "grad_norm": 0.5887011885643005, "learning_rate": 0.0002, - "loss": 0.5384, + "loss": 0.5594, "step": 1520 }, { - "epoch": 0.1752938163398879, - "grad_norm": 0.6492679715156555, + "epoch": 0.12646533494836684, + "grad_norm": 0.547126829624176, "learning_rate": 0.0002, - "loss": 0.4981, + "loss": 0.5574, "step": 1540 }, { - "epoch": 0.17757035940923707, - "grad_norm": 0.5511758327484131, + "epoch": 0.12810774189574822, + "grad_norm": 0.532454788684845, "learning_rate": 0.0002, - "loss": 0.5289, + "loss": 0.5506, "step": 1560 }, { - "epoch": 0.17984690247858626, - "grad_norm": 0.5211341977119446, + "epoch": 0.1297501488431296, + "grad_norm": 0.592251718044281, "learning_rate": 0.0002, - "loss": 0.5002, + "loss": 0.5206, "step": 1580 }, { - "epoch": 0.18212344554793547, - "grad_norm": 0.5488260984420776, + "epoch": 0.13139255579051098, + "grad_norm": 0.6189798712730408, "learning_rate": 0.0002, - "loss": 0.5178, + "loss": 0.516, "step": 1600 }, { - "epoch": 0.18439998861728465, - "grad_norm": 0.6779264211654663, + "epoch": 0.1330349627378924, + "grad_norm": 0.4614121913909912, "learning_rate": 0.0002, - "loss": 0.5155, + "loss": 0.4948, "step": 1620 }, { - "epoch": 0.18667653168663384, - "grad_norm": 0.502919614315033, + "epoch": 0.13467736968527377, + "grad_norm": 0.6192139983177185, "learning_rate": 0.0002, - "loss": 0.4923, + "loss": 0.4924, "step": 1640 }, { - "epoch": 0.18895307475598305, - "grad_norm": 0.4989205300807953, + "epoch": 0.13631977663265515, + "grad_norm": 0.5383406281471252, "learning_rate": 0.0002, - "loss": 0.4825, + "loss": 0.4955, "step": 1660 }, { - "epoch": 0.19122961782533224, - "grad_norm": 0.5155315399169922, + "epoch": 0.13796218358003653, + "grad_norm": 0.681564450263977, "learning_rate": 0.0002, - "loss": 0.4796, + "loss": 0.5224, "step": 1680 }, { - "epoch": 0.19350616089468142, - "grad_norm": 0.5648865699768066, + "epoch": 0.13960459052741794, + "grad_norm": 0.51935875415802, "learning_rate": 0.0002, - "loss": 0.4985, + "loss": 0.508, "step": 1700 }, { - "epoch": 0.1957827039640306, - "grad_norm": 0.606176495552063, + "epoch": 0.14124699747479932, + "grad_norm": 0.532661497592926, "learning_rate": 0.0002, - "loss": 0.4819, + "loss": 0.5362, "step": 1720 }, { - "epoch": 0.19805924703337982, - "grad_norm": 0.5440786480903625, + "epoch": 0.1428894044221807, + "grad_norm": 0.40774333477020264, "learning_rate": 0.0002, - "loss": 0.5213, + "loss": 0.4908, "step": 1740 }, { - "epoch": 0.200335790102729, - "grad_norm": 0.43152502179145813, + "epoch": 0.14453181136956209, + "grad_norm": 0.6406064033508301, "learning_rate": 0.0002, - "loss": 0.4429, + "loss": 0.4891, "step": 1760 }, { - "epoch": 0.2026123331720782, - "grad_norm": 0.5701313614845276, + "epoch": 0.1461742183169435, + "grad_norm": 0.41497862339019775, "learning_rate": 0.0002, - "loss": 0.4486, + "loss": 0.5234, "step": 1780 }, { - "epoch": 0.2048888762414274, - "grad_norm": 0.565666913986206, + "epoch": 0.14781662526432487, + "grad_norm": 0.502389132976532, "learning_rate": 0.0002, - "loss": 0.4561, + "loss": 0.459, "step": 1800 }, { - "epoch": 0.2071654193107766, - "grad_norm": 0.5725598931312561, + "epoch": 0.14945903221170626, + "grad_norm": 0.5248283743858337, "learning_rate": 0.0002, - "loss": 0.4757, + "loss": 0.4659, "step": 1820 }, { - "epoch": 0.20944196238012577, - "grad_norm": 0.4642520248889923, + "epoch": 0.15110143915908764, + "grad_norm": 0.5587234497070312, "learning_rate": 0.0002, - "loss": 0.438, + "loss": 0.4877, "step": 1840 }, { - "epoch": 0.21171850544947496, - "grad_norm": 0.6077229976654053, + "epoch": 0.15274384610646902, + "grad_norm": 0.479913592338562, "learning_rate": 0.0002, - "loss": 0.4295, + "loss": 0.4598, "step": 1860 }, { - "epoch": 0.21399504851882417, - "grad_norm": 0.6314090490341187, + "epoch": 0.15438625305385043, + "grad_norm": 0.5423480272293091, "learning_rate": 0.0002, - "loss": 0.449, + "loss": 0.4754, "step": 1880 }, { - "epoch": 0.21627159158817336, - "grad_norm": 0.4416756331920624, + "epoch": 0.1560286600012318, + "grad_norm": 0.5485461354255676, "learning_rate": 0.0002, - "loss": 0.4554, + "loss": 0.4681, "step": 1900 }, { - "epoch": 0.21854813465752254, - "grad_norm": 0.5278882384300232, + "epoch": 0.1576710669486132, + "grad_norm": 0.48511844873428345, "learning_rate": 0.0002, - "loss": 0.4554, + "loss": 0.4672, "step": 1920 }, { - "epoch": 0.22082467772687175, - "grad_norm": 0.45619043707847595, + "epoch": 0.15931347389599457, + "grad_norm": 0.49132347106933594, "learning_rate": 0.0002, - "loss": 0.4868, + "loss": 0.4694, "step": 1940 }, { - "epoch": 0.22310122079622094, - "grad_norm": 0.5881581902503967, + "epoch": 0.16095588084337598, + "grad_norm": 0.5654798746109009, "learning_rate": 0.0002, - "loss": 0.4672, + "loss": 0.5047, "step": 1960 }, { - "epoch": 0.22537776386557012, - "grad_norm": 0.5379284024238586, + "epoch": 0.16259828779075736, + "grad_norm": 0.571369469165802, "learning_rate": 0.0002, - "loss": 0.4531, + "loss": 0.4486, "step": 1980 }, { - "epoch": 0.22765430693491934, - "grad_norm": 0.5562624931335449, + "epoch": 0.16424069473813874, + "grad_norm": 0.5438801646232605, "learning_rate": 0.0002, - "loss": 0.464, + "loss": 0.4756, "step": 2000 }, { - "epoch": 0.22993085000426852, - "grad_norm": 0.554499626159668, + "epoch": 0.16588310168552012, + "grad_norm": 0.5384829044342041, "learning_rate": 0.0002, - "loss": 0.446, + "loss": 0.4404, "step": 2020 }, { - "epoch": 0.2322073930736177, - "grad_norm": 0.509219229221344, + "epoch": 0.16752550863290153, + "grad_norm": 0.5565232634544373, "learning_rate": 0.0002, - "loss": 0.4417, + "loss": 0.4672, "step": 2040 }, { - "epoch": 0.2344839361429669, - "grad_norm": 0.5206849575042725, + "epoch": 0.1691679155802829, + "grad_norm": 0.5227774381637573, "learning_rate": 0.0002, - "loss": 0.4118, + "loss": 0.4452, "step": 2060 }, { - "epoch": 0.2367604792123161, - "grad_norm": 0.548729658126831, + "epoch": 0.1708103225276643, + "grad_norm": 0.47740334272384644, "learning_rate": 0.0002, - "loss": 0.4067, + "loss": 0.492, "step": 2080 }, { - "epoch": 0.2390370222816653, - "grad_norm": 0.4220084846019745, + "epoch": 0.17245272947504567, + "grad_norm": 0.4206157326698303, "learning_rate": 0.0002, - "loss": 0.428, + "loss": 0.4517, "step": 2100 }, { - "epoch": 0.24131356535101448, - "grad_norm": 0.5507292747497559, + "epoch": 0.17409513642242708, + "grad_norm": 0.5148787498474121, "learning_rate": 0.0002, - "loss": 0.4176, + "loss": 0.4801, "step": 2120 }, { - "epoch": 0.2435901084203637, - "grad_norm": 0.5605701208114624, + "epoch": 0.17573754336980846, + "grad_norm": 0.4815204441547394, "learning_rate": 0.0002, - "loss": 0.4661, + "loss": 0.4415, "step": 2140 }, { - "epoch": 0.24586665148971287, - "grad_norm": 0.43142881989479065, + "epoch": 0.17737995031718984, + "grad_norm": 0.5302825570106506, "learning_rate": 0.0002, - "loss": 0.4197, + "loss": 0.4558, "step": 2160 }, { - "epoch": 0.24814319455906206, - "grad_norm": 0.47790080308914185, + "epoch": 0.17902235726457122, + "grad_norm": 0.574350118637085, "learning_rate": 0.0002, - "loss": 0.4568, + "loss": 0.4709, "step": 2180 }, { - "epoch": 0.25041973762841124, - "grad_norm": 0.6048968434333801, + "epoch": 0.18066476421195263, + "grad_norm": 0.5393965244293213, "learning_rate": 0.0002, - "loss": 0.4199, + "loss": 0.4528, "step": 2200 }, { - "epoch": 0.25269628069776046, - "grad_norm": 0.4925907850265503, + "epoch": 0.182307171159334, + "grad_norm": 0.43285471200942993, "learning_rate": 0.0002, - "loss": 0.4325, + "loss": 0.4294, "step": 2220 }, { - "epoch": 0.25497282376710967, - "grad_norm": 0.5463051199913025, + "epoch": 0.1839495781067154, + "grad_norm": 0.4550113081932068, "learning_rate": 0.0002, - "loss": 0.4549, + "loss": 0.4395, "step": 2240 }, { - "epoch": 0.2572493668364588, - "grad_norm": 0.4631319046020508, + "epoch": 0.18559198505409677, + "grad_norm": 0.586071789264679, "learning_rate": 0.0002, - "loss": 0.3977, + "loss": 0.4456, "step": 2260 }, { - "epoch": 0.25952590990580804, - "grad_norm": 0.4965234398841858, + "epoch": 0.18723439200147818, + "grad_norm": 0.5634139776229858, "learning_rate": 0.0002, - "loss": 0.4285, + "loss": 0.4295, "step": 2280 }, { - "epoch": 0.2618024529751572, - "grad_norm": 0.5436238646507263, + "epoch": 0.18887679894885956, + "grad_norm": 0.5095311403274536, "learning_rate": 0.0002, - "loss": 0.4039, + "loss": 0.4347, "step": 2300 }, { - "epoch": 0.2640789960445064, - "grad_norm": 0.5218191742897034, + "epoch": 0.19051920589624094, + "grad_norm": 0.6051989793777466, "learning_rate": 0.0002, - "loss": 0.4092, + "loss": 0.4278, "step": 2320 }, { - "epoch": 0.2663555391138556, - "grad_norm": 0.5417261719703674, + "epoch": 0.19216161284362232, + "grad_norm": 0.45743292570114136, "learning_rate": 0.0002, - "loss": 0.3825, + "loss": 0.4191, "step": 2340 }, { - "epoch": 0.2686320821832048, - "grad_norm": 0.6126281023025513, + "epoch": 0.19380401979100373, + "grad_norm": 0.6048611402511597, "learning_rate": 0.0002, - "loss": 0.4391, + "loss": 0.4512, "step": 2360 }, { - "epoch": 0.270908625252554, - "grad_norm": 0.4734433889389038, + "epoch": 0.1954464267383851, + "grad_norm": 0.495731920003891, "learning_rate": 0.0002, - "loss": 0.4151, + "loss": 0.4087, "step": 2380 }, { - "epoch": 0.2731851683219032, - "grad_norm": 0.4501429796218872, + "epoch": 0.1970888336857665, + "grad_norm": 0.5746319890022278, "learning_rate": 0.0002, - "loss": 0.4178, + "loss": 0.4112, "step": 2400 }, { - "epoch": 0.27546171139125236, - "grad_norm": 0.5258509516716003, + "epoch": 0.19873124063314787, + "grad_norm": 0.4899024963378906, "learning_rate": 0.0002, - "loss": 0.4007, + "loss": 0.4403, "step": 2420 }, { - "epoch": 0.2777382544606016, - "grad_norm": 0.47874951362609863, + "epoch": 0.20037364758052928, + "grad_norm": 0.40732160210609436, "learning_rate": 0.0002, - "loss": 0.4245, + "loss": 0.4281, "step": 2440 }, { - "epoch": 0.2800147975299508, - "grad_norm": 0.528533399105072, + "epoch": 0.20201605452791066, + "grad_norm": 0.4896198809146881, "learning_rate": 0.0002, - "loss": 0.3794, + "loss": 0.4533, "step": 2460 }, { - "epoch": 0.28229134059929994, - "grad_norm": 0.46465063095092773, + "epoch": 0.20365846147529204, + "grad_norm": 0.5733948349952698, "learning_rate": 0.0002, - "loss": 0.4019, + "loss": 0.4113, "step": 2480 }, { - "epoch": 0.28456788366864916, - "grad_norm": 0.5217177867889404, + "epoch": 0.20530086842267342, + "grad_norm": 0.4565046429634094, "learning_rate": 0.0002, - "loss": 0.4104, + "loss": 0.4237, "step": 2500 }, { - "epoch": 0.28684442673799837, - "grad_norm": 0.510036289691925, + "epoch": 0.2069432753700548, + "grad_norm": 0.5932797789573669, "learning_rate": 0.0002, - "loss": 0.389, + "loss": 0.4367, "step": 2520 }, { - "epoch": 0.2891209698073475, - "grad_norm": 0.6968228220939636, + "epoch": 0.2085856823174362, + "grad_norm": 0.5838333368301392, "learning_rate": 0.0002, - "loss": 0.4152, + "loss": 0.4331, "step": 2540 }, { - "epoch": 0.29139751287669674, - "grad_norm": 0.4529867470264435, + "epoch": 0.2102280892648176, + "grad_norm": 0.5022397637367249, "learning_rate": 0.0002, - "loss": 0.3987, + "loss": 0.4004, "step": 2560 }, { - "epoch": 0.29367405594604595, - "grad_norm": 0.5680263638496399, + "epoch": 0.21187049621219897, + "grad_norm": 0.5949686765670776, "learning_rate": 0.0002, - "loss": 0.3828, + "loss": 0.4119, "step": 2580 }, { - "epoch": 0.2959505990153951, - "grad_norm": 0.4892405867576599, + "epoch": 0.21351290315958035, + "grad_norm": 0.45230528712272644, "learning_rate": 0.0002, - "loss": 0.4006, + "loss": 0.4217, "step": 2600 }, { - "epoch": 0.2982271420847443, - "grad_norm": 0.47588276863098145, + "epoch": 0.21515531010696176, + "grad_norm": 0.4186144471168518, "learning_rate": 0.0002, - "loss": 0.4197, + "loss": 0.428, "step": 2620 }, { - "epoch": 0.30050368515409354, - "grad_norm": 0.5624070167541504, + "epoch": 0.21679771705434314, + "grad_norm": 0.5562434196472168, "learning_rate": 0.0002, - "loss": 0.3997, + "loss": 0.394, "step": 2640 }, { - "epoch": 0.3027802282234427, - "grad_norm": 0.5434039831161499, + "epoch": 0.21844012400172452, + "grad_norm": 0.5947513580322266, "learning_rate": 0.0002, - "loss": 0.3977, + "loss": 0.3998, "step": 2660 }, { - "epoch": 0.3050567712927919, - "grad_norm": 0.5572277903556824, + "epoch": 0.2200825309491059, + "grad_norm": 0.4886711835861206, "learning_rate": 0.0002, - "loss": 0.3966, + "loss": 0.389, "step": 2680 }, { - "epoch": 0.30733331436214106, - "grad_norm": 0.5533374547958374, + "epoch": 0.2217249378964873, + "grad_norm": 0.551491379737854, "learning_rate": 0.0002, - "loss": 0.3803, + "loss": 0.3952, "step": 2700 }, { - "epoch": 0.3096098574314903, - "grad_norm": 0.40596967935562134, + "epoch": 0.2233673448438687, + "grad_norm": 0.383627712726593, "learning_rate": 0.0002, - "loss": 0.3682, + "loss": 0.3733, "step": 2720 }, { - "epoch": 0.3118864005008395, - "grad_norm": 0.4737823009490967, + "epoch": 0.22500975179125007, + "grad_norm": 0.45694270730018616, "learning_rate": 0.0002, - "loss": 0.3761, + "loss": 0.4075, "step": 2740 }, { - "epoch": 0.31416294357018865, - "grad_norm": 0.4295174777507782, + "epoch": 0.22665215873863145, + "grad_norm": 0.46876367926597595, "learning_rate": 0.0002, - "loss": 0.4035, + "loss": 0.4135, "step": 2760 }, { - "epoch": 0.31643948663953786, - "grad_norm": 0.5348454713821411, + "epoch": 0.22829456568601286, + "grad_norm": 0.9062886238098145, "learning_rate": 0.0002, - "loss": 0.404, + "loss": 0.3891, "step": 2780 }, { - "epoch": 0.31871602970888707, - "grad_norm": 0.4819965362548828, + "epoch": 0.22993697263339424, + "grad_norm": 0.47902002930641174, "learning_rate": 0.0002, - "loss": 0.3929, + "loss": 0.405, "step": 2800 }, { - "epoch": 0.32099257277823623, - "grad_norm": 0.5920088291168213, + "epoch": 0.23157937958077562, + "grad_norm": 0.6828575134277344, "learning_rate": 0.0002, - "loss": 0.3798, + "loss": 0.3985, "step": 2820 }, { - "epoch": 0.32326911584758544, - "grad_norm": 0.4936531186103821, + "epoch": 0.233221786528157, + "grad_norm": 0.5411036610603333, "learning_rate": 0.0002, - "loss": 0.3995, + "loss": 0.3658, "step": 2840 }, { - "epoch": 0.32554565891693465, - "grad_norm": 0.5252315998077393, + "epoch": 0.2348641934755384, + "grad_norm": 0.6698014736175537, "learning_rate": 0.0002, - "loss": 0.3842, + "loss": 0.4003, "step": 2860 }, { - "epoch": 0.3278222019862838, - "grad_norm": 0.5818414688110352, + "epoch": 0.2365066004229198, + "grad_norm": 0.5779656171798706, "learning_rate": 0.0002, - "loss": 0.3533, + "loss": 0.4003, "step": 2880 }, { - "epoch": 0.330098745055633, - "grad_norm": 0.44053876399993896, + "epoch": 0.23814900737030117, + "grad_norm": 0.5321545004844666, "learning_rate": 0.0002, - "loss": 0.3402, + "loss": 0.3667, "step": 2900 }, { - "epoch": 0.33237528812498224, - "grad_norm": 0.5421345233917236, + "epoch": 0.23979141431768256, + "grad_norm": 0.43935510516166687, "learning_rate": 0.0002, - "loss": 0.3542, + "loss": 0.375, "step": 2920 }, { - "epoch": 0.3346518311943314, - "grad_norm": 0.4642751216888428, + "epoch": 0.24143382126506396, + "grad_norm": 0.67582768201828, "learning_rate": 0.0002, - "loss": 0.3755, + "loss": 0.3814, "step": 2940 }, { - "epoch": 0.3369283742636806, - "grad_norm": 0.5137833952903748, + "epoch": 0.24307622821244534, + "grad_norm": 0.6373169422149658, "learning_rate": 0.0002, - "loss": 0.3602, + "loss": 0.4079, "step": 2960 }, { - "epoch": 0.3392049173330298, - "grad_norm": 0.5032792687416077, + "epoch": 0.24471863515982673, + "grad_norm": 0.4568232595920563, "learning_rate": 0.0002, - "loss": 0.3451, + "loss": 0.3821, "step": 2980 }, { - "epoch": 0.341481460402379, - "grad_norm": 0.4932720363140106, + "epoch": 0.2463610421072081, + "grad_norm": 0.5706847310066223, "learning_rate": 0.0002, - "loss": 0.384, + "loss": 0.3745, "step": 3000 }, { - "epoch": 0.3437580034717282, - "grad_norm": 0.49986231327056885, + "epoch": 0.24800344905458951, + "grad_norm": 0.5293543338775635, "learning_rate": 0.0002, - "loss": 0.3826, + "loss": 0.3945, "step": 3020 }, { - "epoch": 0.34603454654107735, - "grad_norm": 0.6325618624687195, + "epoch": 0.2496458560019709, + "grad_norm": 0.5566920042037964, "learning_rate": 0.0002, - "loss": 0.3582, + "loss": 0.3739, "step": 3040 }, { - "epoch": 0.34831108961042656, - "grad_norm": 0.5402369499206543, + "epoch": 0.2512882629493523, + "grad_norm": 0.5758338570594788, "learning_rate": 0.0002, - "loss": 0.3706, + "loss": 0.4115, "step": 3060 }, { - "epoch": 0.3505876326797758, - "grad_norm": 0.4967012107372284, + "epoch": 0.2529306698967337, + "grad_norm": 0.5503116250038147, "learning_rate": 0.0002, - "loss": 0.3456, + "loss": 0.3841, "step": 3080 }, { - "epoch": 0.35286417574912493, - "grad_norm": 0.4491735100746155, + "epoch": 0.25457307684411506, + "grad_norm": 0.5829768776893616, "learning_rate": 0.0002, - "loss": 0.347, + "loss": 0.3679, "step": 3100 }, { - "epoch": 0.35514071881847414, - "grad_norm": 0.9062516093254089, + "epoch": 0.25621548379149645, + "grad_norm": 0.4771459400653839, "learning_rate": 0.0002, - "loss": 0.3617, + "loss": 0.3787, "step": 3120 }, { - "epoch": 0.35741726188782336, - "grad_norm": 0.5253359079360962, + "epoch": 0.2578578907388778, + "grad_norm": 0.508679986000061, "learning_rate": 0.0002, - "loss": 0.3512, + "loss": 0.3424, "step": 3140 }, { - "epoch": 0.3596938049571725, - "grad_norm": 0.4836867153644562, + "epoch": 0.2595002976862592, + "grad_norm": 0.5478394031524658, "learning_rate": 0.0002, - "loss": 0.3585, + "loss": 0.3616, "step": 3160 }, { - "epoch": 0.3619703480265217, - "grad_norm": 0.49537473917007446, + "epoch": 0.2611427046336406, + "grad_norm": 0.48918816447257996, "learning_rate": 0.0002, "loss": 0.364, "step": 3180 }, { - "epoch": 0.36424689109587094, - "grad_norm": 0.6098095178604126, + "epoch": 0.26278511158102197, + "grad_norm": 0.6158058047294617, "learning_rate": 0.0002, - "loss": 0.3455, + "loss": 0.3563, "step": 3200 }, { - "epoch": 0.3665234341652201, - "grad_norm": 0.5926884412765503, + "epoch": 0.26442751852840335, + "grad_norm": 0.6302765607833862, "learning_rate": 0.0002, - "loss": 0.3406, + "loss": 0.3472, "step": 3220 }, { - "epoch": 0.3687999772345693, - "grad_norm": 0.5868669152259827, + "epoch": 0.2660699254757848, + "grad_norm": 0.42650097608566284, "learning_rate": 0.0002, - "loss": 0.3643, + "loss": 0.374, "step": 3240 }, { - "epoch": 0.3710765203039185, - "grad_norm": 0.42670106887817383, + "epoch": 0.26771233242316617, + "grad_norm": 0.5517419576644897, "learning_rate": 0.0002, - "loss": 0.344, + "loss": 0.3747, "step": 3260 }, { - "epoch": 0.3733530633732677, - "grad_norm": 0.5992838740348816, + "epoch": 0.26935473937054755, + "grad_norm": 0.5887686014175415, "learning_rate": 0.0002, - "loss": 0.3588, + "loss": 0.3655, "step": 3280 }, { - "epoch": 0.3756296064426169, - "grad_norm": 0.4388341009616852, + "epoch": 0.2709971463179289, + "grad_norm": 0.5252538323402405, "learning_rate": 0.0002, - "loss": 0.3375, + "loss": 0.3864, "step": 3300 }, { - "epoch": 0.3779061495119661, - "grad_norm": 0.596488893032074, + "epoch": 0.2726395532653103, + "grad_norm": 0.4829944968223572, "learning_rate": 0.0002, - "loss": 0.3425, + "loss": 0.3526, "step": 3320 }, { - "epoch": 0.38018269258131526, - "grad_norm": 0.4572538137435913, + "epoch": 0.2742819602126917, + "grad_norm": 0.4375133216381073, "learning_rate": 0.0002, - "loss": 0.3711, + "loss": 0.3536, "step": 3340 }, { - "epoch": 0.3824592356506645, - "grad_norm": 0.5661656856536865, + "epoch": 0.27592436716007307, + "grad_norm": 0.5371789336204529, "learning_rate": 0.0002, - "loss": 0.3415, + "loss": 0.3501, "step": 3360 }, { - "epoch": 0.38473577872001363, - "grad_norm": 0.45082923769950867, + "epoch": 0.27756677410745445, + "grad_norm": 0.44075456261634827, "learning_rate": 0.0002, - "loss": 0.3495, + "loss": 0.3584, "step": 3380 }, { - "epoch": 0.38701232178936285, - "grad_norm": 0.4995211660861969, + "epoch": 0.2792091810548359, + "grad_norm": 0.53825443983078, "learning_rate": 0.0002, - "loss": 0.3311, + "loss": 0.3304, "step": 3400 }, { - "epoch": 0.38928886485871206, - "grad_norm": 0.5004004240036011, + "epoch": 0.28085158800221727, + "grad_norm": 0.48521581292152405, "learning_rate": 0.0002, - "loss": 0.3506, + "loss": 0.3588, "step": 3420 }, { - "epoch": 0.3915654079280612, - "grad_norm": 0.5676460266113281, + "epoch": 0.28249399494959865, + "grad_norm": 0.4189339578151703, "learning_rate": 0.0002, - "loss": 0.3383, + "loss": 0.3556, "step": 3440 }, { - "epoch": 0.39384195099741043, - "grad_norm": 0.4805515706539154, + "epoch": 0.28413640189698003, + "grad_norm": 0.4011813700199127, "learning_rate": 0.0002, - "loss": 0.3382, + "loss": 0.3403, "step": 3460 }, { - "epoch": 0.39611849406675964, - "grad_norm": 0.47675764560699463, + "epoch": 0.2857788088443614, + "grad_norm": 0.4910661280155182, "learning_rate": 0.0002, - "loss": 0.3021, + "loss": 0.3897, "step": 3480 }, { - "epoch": 0.3983950371361088, - "grad_norm": 0.6285260915756226, + "epoch": 0.2874212157917428, + "grad_norm": 0.5664734840393066, "learning_rate": 0.0002, - "loss": 0.3467, + "loss": 0.3503, "step": 3500 }, { - "epoch": 0.400671580205458, - "grad_norm": 0.5657575130462646, + "epoch": 0.28906362273912417, + "grad_norm": 0.45044422149658203, "learning_rate": 0.0002, - "loss": 0.3382, + "loss": 0.3357, "step": 3520 }, { - "epoch": 0.4029481232748072, - "grad_norm": 0.6148316860198975, + "epoch": 0.29070602968650555, + "grad_norm": 0.6162013411521912, "learning_rate": 0.0002, - "loss": 0.3396, + "loss": 0.3827, "step": 3540 }, { - "epoch": 0.4052246663441564, - "grad_norm": 0.5819992423057556, + "epoch": 0.292348436633887, + "grad_norm": 0.428659588098526, "learning_rate": 0.0002, - "loss": 0.3373, + "loss": 0.3418, "step": 3560 }, { - "epoch": 0.4075012094135056, - "grad_norm": 0.6080338954925537, + "epoch": 0.29399084358126837, + "grad_norm": 0.48843899369239807, "learning_rate": 0.0002, - "loss": 0.3463, + "loss": 0.3695, "step": 3580 }, { - "epoch": 0.4097777524828548, - "grad_norm": 0.6103864312171936, + "epoch": 0.29563325052864975, + "grad_norm": 0.5662574768066406, "learning_rate": 0.0002, - "loss": 0.3441, + "loss": 0.3418, "step": 3600 }, { - "epoch": 0.41205429555220396, - "grad_norm": 0.5234800577163696, + "epoch": 0.29727565747603113, + "grad_norm": 0.5488101243972778, "learning_rate": 0.0002, - "loss": 0.3272, + "loss": 0.3619, "step": 3620 }, { - "epoch": 0.4143308386215532, - "grad_norm": 0.5393822193145752, + "epoch": 0.2989180644234125, + "grad_norm": 0.4078102111816406, "learning_rate": 0.0002, - "loss": 0.3308, + "loss": 0.3339, "step": 3640 }, { - "epoch": 0.4166073816909024, - "grad_norm": 0.4853431284427643, + "epoch": 0.3005604713707939, + "grad_norm": 0.6991748213768005, "learning_rate": 0.0002, - "loss": 0.3152, + "loss": 0.3653, "step": 3660 }, { - "epoch": 0.41888392476025155, - "grad_norm": 0.5507264733314514, + "epoch": 0.30220287831817527, + "grad_norm": 0.4532040059566498, "learning_rate": 0.0002, - "loss": 0.3229, + "loss": 0.343, "step": 3680 }, { - "epoch": 0.42116046782960076, - "grad_norm": 0.44306129217147827, + "epoch": 0.30384528526555665, + "grad_norm": 0.47306913137435913, "learning_rate": 0.0002, - "loss": 0.3389, + "loss": 0.3551, "step": 3700 }, { - "epoch": 0.4234370108989499, - "grad_norm": 0.4574294984340668, + "epoch": 0.30548769221293803, + "grad_norm": 0.4408378303050995, "learning_rate": 0.0002, - "loss": 0.3516, + "loss": 0.3441, "step": 3720 }, { - "epoch": 0.42571355396829913, - "grad_norm": 0.5367994904518127, + "epoch": 0.30713009916031947, + "grad_norm": 0.5125454068183899, "learning_rate": 0.0002, - "loss": 0.3576, + "loss": 0.3578, "step": 3740 }, { - "epoch": 0.42799009703764834, - "grad_norm": 0.5044491291046143, + "epoch": 0.30877250610770085, + "grad_norm": 0.5483905076980591, "learning_rate": 0.0002, - "loss": 0.3449, + "loss": 0.3344, "step": 3760 }, { - "epoch": 0.4302666401069975, - "grad_norm": 0.41715556383132935, + "epoch": 0.31041491305508223, + "grad_norm": 0.3780999779701233, "learning_rate": 0.0002, - "loss": 0.3128, + "loss": 0.3491, "step": 3780 }, { - "epoch": 0.4325431831763467, - "grad_norm": 0.4355817437171936, + "epoch": 0.3120573200024636, + "grad_norm": 0.4443167746067047, "learning_rate": 0.0002, - "loss": 0.3131, + "loss": 0.3406, "step": 3800 }, { - "epoch": 0.4348197262456959, - "grad_norm": 0.5237382650375366, + "epoch": 0.313699726949845, + "grad_norm": 0.5337740182876587, "learning_rate": 0.0002, - "loss": 0.3281, + "loss": 0.3369, "step": 3820 }, { - "epoch": 0.4370962693150451, - "grad_norm": 0.6210081577301025, + "epoch": 0.3153421338972264, + "grad_norm": 0.5371155738830566, "learning_rate": 0.0002, - "loss": 0.3195, + "loss": 0.3579, "step": 3840 }, { - "epoch": 0.4393728123843943, - "grad_norm": 0.5145352482795715, + "epoch": 0.31698454084460775, + "grad_norm": 0.49183839559555054, "learning_rate": 0.0002, - "loss": 0.3107, + "loss": 0.3359, "step": 3860 }, { - "epoch": 0.4416493554537435, - "grad_norm": 0.5554608106613159, + "epoch": 0.31862694779198913, + "grad_norm": 0.5076944828033447, "learning_rate": 0.0002, - "loss": 0.3418, + "loss": 0.3604, "step": 3880 }, { - "epoch": 0.44392589852309267, - "grad_norm": 0.4971628487110138, + "epoch": 0.32026935473937057, + "grad_norm": 0.5076488256454468, "learning_rate": 0.0002, - "loss": 0.3293, + "loss": 0.3373, "step": 3900 }, { - "epoch": 0.4462024415924419, - "grad_norm": 0.49732130765914917, + "epoch": 0.32191176168675195, + "grad_norm": 0.519506573677063, "learning_rate": 0.0002, - "loss": 0.3138, + "loss": 0.3529, "step": 3920 }, { - "epoch": 0.4484789846617911, - "grad_norm": 0.5883257985115051, + "epoch": 0.32355416863413333, + "grad_norm": 0.3967176079750061, "learning_rate": 0.0002, - "loss": 0.3357, + "loss": 0.3203, "step": 3940 }, { - "epoch": 0.45075552773114025, - "grad_norm": 0.5349528193473816, + "epoch": 0.3251965755815147, + "grad_norm": 0.5084711313247681, "learning_rate": 0.0002, - "loss": 0.3381, + "loss": 0.3323, "step": 3960 }, { - "epoch": 0.45303207080048946, - "grad_norm": 0.5360047221183777, + "epoch": 0.3268389825288961, + "grad_norm": 0.5324501991271973, "learning_rate": 0.0002, - "loss": 0.3116, + "loss": 0.3351, "step": 3980 }, { - "epoch": 0.4553086138698387, - "grad_norm": 0.4889732003211975, + "epoch": 0.3284813894762775, + "grad_norm": 0.4679279923439026, "learning_rate": 0.0002, - "loss": 0.3154, + "loss": 0.322, "step": 4000 }, { - "epoch": 0.45758515693918783, - "grad_norm": 0.4912421703338623, + "epoch": 0.33012379642365886, + "grad_norm": 0.5273401737213135, "learning_rate": 0.0002, - "loss": 0.3054, + "loss": 0.358, "step": 4020 }, { - "epoch": 0.45986170000853704, - "grad_norm": 0.4449983835220337, + "epoch": 0.33176620337104024, + "grad_norm": 0.560130774974823, "learning_rate": 0.0002, - "loss": 0.3079, + "loss": 0.3252, "step": 4040 }, { - "epoch": 0.46213824307788626, - "grad_norm": 0.4488675892353058, + "epoch": 0.33340861031842167, + "grad_norm": 0.7334967851638794, "learning_rate": 0.0002, - "loss": 0.3027, + "loss": 0.3125, "step": 4060 }, { - "epoch": 0.4644147861472354, - "grad_norm": 0.5412561893463135, + "epoch": 0.33505101726580305, + "grad_norm": 0.448902428150177, "learning_rate": 0.0002, - "loss": 0.2932, + "loss": 0.3337, "step": 4080 }, { - "epoch": 0.4666913292165846, - "grad_norm": 0.41218650341033936, + "epoch": 0.33669342421318443, + "grad_norm": 0.42839765548706055, "learning_rate": 0.0002, - "loss": 0.3087, + "loss": 0.3332, "step": 4100 }, { - "epoch": 0.4689678722859338, - "grad_norm": 0.5233949422836304, + "epoch": 0.3383358311605658, + "grad_norm": 0.43117448687553406, "learning_rate": 0.0002, - "loss": 0.3157, + "loss": 0.3204, "step": 4120 }, { - "epoch": 0.471244415355283, - "grad_norm": 0.5676075220108032, + "epoch": 0.3399782381079472, + "grad_norm": 0.4213992953300476, "learning_rate": 0.0002, - "loss": 0.3267, + "loss": 0.3421, "step": 4140 }, { - "epoch": 0.4735209584246322, - "grad_norm": 0.5336834788322449, + "epoch": 0.3416206450553286, + "grad_norm": 0.40054526925086975, "learning_rate": 0.0002, - "loss": 0.3185, + "loss": 0.3115, "step": 4160 }, { - "epoch": 0.47579750149398137, - "grad_norm": 0.5505925416946411, + "epoch": 0.34326305200270996, + "grad_norm": 0.5090795159339905, "learning_rate": 0.0002, - "loss": 0.3116, + "loss": 0.3324, "step": 4180 }, { - "epoch": 0.4780740445633306, - "grad_norm": 0.5440223813056946, + "epoch": 0.34490545895009134, + "grad_norm": 0.5156223177909851, "learning_rate": 0.0002, - "loss": 0.3234, + "loss": 0.3186, "step": 4200 }, { - "epoch": 0.4803505876326798, - "grad_norm": 0.46334293484687805, + "epoch": 0.3465478658974728, + "grad_norm": 0.4297846555709839, "learning_rate": 0.0002, - "loss": 0.3209, + "loss": 0.312, "step": 4220 }, { - "epoch": 0.48262713070202895, - "grad_norm": 0.452364444732666, + "epoch": 0.34819027284485415, + "grad_norm": 0.4857240617275238, "learning_rate": 0.0002, - "loss": 0.3056, + "loss": 0.3202, "step": 4240 }, { - "epoch": 0.48490367377137816, - "grad_norm": 0.5037956833839417, + "epoch": 0.34983267979223553, + "grad_norm": 0.6078678965568542, "learning_rate": 0.0002, - "loss": 0.3141, + "loss": 0.3329, "step": 4260 }, { - "epoch": 0.4871802168407274, - "grad_norm": 0.4308939278125763, + "epoch": 0.3514750867396169, + "grad_norm": 0.5576339364051819, "learning_rate": 0.0002, - "loss": 0.2948, + "loss": 0.333, "step": 4280 }, { - "epoch": 0.48945675991007653, - "grad_norm": 0.45019960403442383, + "epoch": 0.3531174936869983, + "grad_norm": 0.5340404510498047, "learning_rate": 0.0002, - "loss": 0.3142, + "loss": 0.3367, "step": 4300 }, { - "epoch": 0.49173330297942575, - "grad_norm": 0.4351404011249542, + "epoch": 0.3547599006343797, + "grad_norm": 0.5187095999717712, "learning_rate": 0.0002, - "loss": 0.31, + "loss": 0.3579, "step": 4320 }, { - "epoch": 0.49400984604877496, - "grad_norm": 0.38306841254234314, + "epoch": 0.35640230758176106, + "grad_norm": 0.4246378540992737, "learning_rate": 0.0002, - "loss": 0.2889, + "loss": 0.3281, "step": 4340 }, { - "epoch": 0.4962863891181241, - "grad_norm": 0.545360803604126, + "epoch": 0.35804471452914244, + "grad_norm": 0.6137174963951111, "learning_rate": 0.0002, - "loss": 0.311, + "loss": 0.3248, "step": 4360 }, { - "epoch": 0.49856293218747333, - "grad_norm": 0.44942232966423035, + "epoch": 0.3596871214765238, + "grad_norm": 0.44220972061157227, "learning_rate": 0.0002, - "loss": 0.2899, + "loss": 0.3267, "step": 4380 }, { - "epoch": 0.5008394752568225, - "grad_norm": 0.46564239263534546, + "epoch": 0.36132952842390526, + "grad_norm": 0.4254567325115204, "learning_rate": 0.0002, - "loss": 0.3013, + "loss": 0.315, "step": 4400 }, { - "epoch": 0.5031160183261717, - "grad_norm": 0.5398554801940918, + "epoch": 0.36297193537128664, + "grad_norm": 0.66693115234375, "learning_rate": 0.0002, - "loss": 0.3104, + "loss": 0.3354, "step": 4420 }, { - "epoch": 0.5053925613955209, - "grad_norm": 0.47367504239082336, + "epoch": 0.364614342318668, + "grad_norm": 0.5646852254867554, "learning_rate": 0.0002, - "loss": 0.2945, + "loss": 0.3275, "step": 4440 }, { - "epoch": 0.5076691044648701, - "grad_norm": 0.45659711956977844, + "epoch": 0.3662567492660494, + "grad_norm": 0.525794506072998, "learning_rate": 0.0002, - "loss": 0.304, + "loss": 0.3095, "step": 4460 }, { - "epoch": 0.5099456475342193, - "grad_norm": 0.4942033290863037, + "epoch": 0.3678991562134308, + "grad_norm": 0.5454958081245422, "learning_rate": 0.0002, - "loss": 0.2969, + "loss": 0.3177, "step": 4480 }, { - "epoch": 0.5122221906035684, - "grad_norm": 0.46578243374824524, + "epoch": 0.36954156316081216, + "grad_norm": 0.5054097771644592, "learning_rate": 0.0002, - "loss": 0.2935, + "loss": 0.3291, "step": 4500 }, { - "epoch": 0.5144987336729177, - "grad_norm": 0.6523891687393188, + "epoch": 0.37118397010819354, + "grad_norm": 0.45259889960289, "learning_rate": 0.0002, - "loss": 0.2823, + "loss": 0.3309, "step": 4520 }, { - "epoch": 0.5167752767422669, - "grad_norm": 0.4787238538265228, + "epoch": 0.3728263770555749, + "grad_norm": 0.4160098135471344, "learning_rate": 0.0002, - "loss": 0.3148, + "loss": 0.3416, "step": 4540 }, { - "epoch": 0.5190518198116161, - "grad_norm": 0.46825891733169556, + "epoch": 0.37446878400295636, + "grad_norm": 0.36465033888816833, "learning_rate": 0.0002, - "loss": 0.3089, + "loss": 0.3244, "step": 4560 }, { - "epoch": 0.5213283628809653, - "grad_norm": 0.46605536341667175, + "epoch": 0.37611119095033774, + "grad_norm": 0.3822501301765442, "learning_rate": 0.0002, - "loss": 0.3012, + "loss": 0.3163, "step": 4580 }, { - "epoch": 0.5236049059503144, - "grad_norm": 0.5826888680458069, + "epoch": 0.3777535978977191, + "grad_norm": 0.4484947621822357, "learning_rate": 0.0002, - "loss": 0.3043, + "loss": 0.3186, "step": 4600 }, { - "epoch": 0.5258814490196636, - "grad_norm": 0.48641151189804077, + "epoch": 0.3793960048451005, + "grad_norm": 0.481303334236145, "learning_rate": 0.0002, - "loss": 0.2952, + "loss": 0.3202, "step": 4620 }, { - "epoch": 0.5281579920890128, - "grad_norm": 0.5396175384521484, + "epoch": 0.3810384117924819, + "grad_norm": 0.5275722742080688, "learning_rate": 0.0002, - "loss": 0.2926, + "loss": 0.319, "step": 4640 }, { - "epoch": 0.530434535158362, - "grad_norm": 0.5584241151809692, + "epoch": 0.38268081873986326, + "grad_norm": 0.5782263278961182, "learning_rate": 0.0002, - "loss": 0.3048, + "loss": 0.327, "step": 4660 }, { - "epoch": 0.5327110782277112, - "grad_norm": 0.5832685232162476, + "epoch": 0.38432322568724464, + "grad_norm": 0.511466920375824, "learning_rate": 0.0002, - "loss": 0.2948, + "loss": 0.3176, "step": 4680 }, { - "epoch": 0.5349876212970605, - "grad_norm": 0.4676337242126465, + "epoch": 0.385965632634626, + "grad_norm": 0.5383144617080688, "learning_rate": 0.0002, - "loss": 0.3043, + "loss": 0.3215, "step": 4700 }, { - "epoch": 0.5372641643664096, - "grad_norm": 0.4440428614616394, + "epoch": 0.38760803958200746, + "grad_norm": 0.47731462121009827, "learning_rate": 0.0002, - "loss": 0.288, + "loss": 0.3184, "step": 4720 }, { - "epoch": 0.5395407074357588, - "grad_norm": 0.49934279918670654, + "epoch": 0.38925044652938884, + "grad_norm": 0.43928396701812744, "learning_rate": 0.0002, - "loss": 0.2882, + "loss": 0.2998, "step": 4740 }, { - "epoch": 0.541817250505108, - "grad_norm": 0.5172054171562195, + "epoch": 0.3908928534767702, + "grad_norm": 0.47170737385749817, "learning_rate": 0.0002, - "loss": 0.3225, + "loss": 0.3211, "step": 4760 }, { - "epoch": 0.5440937935744572, - "grad_norm": 0.4527619183063507, + "epoch": 0.3925352604241516, + "grad_norm": 0.39744389057159424, "learning_rate": 0.0002, - "loss": 0.2869, + "loss": 0.3119, "step": 4780 }, { - "epoch": 0.5463703366438064, - "grad_norm": 0.548918604850769, + "epoch": 0.394177667371533, + "grad_norm": 0.4669509828090668, "learning_rate": 0.0002, - "loss": 0.3105, + "loss": 0.2965, "step": 4800 }, { - "epoch": 0.5486468797131556, - "grad_norm": 0.48801419138908386, + "epoch": 0.39582007431891436, + "grad_norm": 0.4926499128341675, "learning_rate": 0.0002, - "loss": 0.2835, + "loss": 0.2996, "step": 4820 }, { - "epoch": 0.5509234227825047, - "grad_norm": 0.49810609221458435, + "epoch": 0.39746248126629574, + "grad_norm": 0.4818594455718994, "learning_rate": 0.0002, - "loss": 0.3227, + "loss": 0.3116, "step": 4840 }, { - "epoch": 0.5531999658518539, - "grad_norm": 0.49763086438179016, + "epoch": 0.3991048882136771, + "grad_norm": 0.4344610571861267, "learning_rate": 0.0002, - "loss": 0.2786, + "loss": 0.2884, "step": 4860 }, { - "epoch": 0.5554765089212031, - "grad_norm": 0.48815059661865234, + "epoch": 0.40074729516105856, + "grad_norm": 0.3993249535560608, "learning_rate": 0.0002, - "loss": 0.2802, + "loss": 0.3096, "step": 4880 }, { - "epoch": 0.5577530519905524, - "grad_norm": 0.3571115732192993, + "epoch": 0.40238970210843994, + "grad_norm": 0.4467979967594147, "learning_rate": 0.0002, - "loss": 0.2796, + "loss": 0.2976, "step": 4900 }, { - "epoch": 0.5600295950599016, - "grad_norm": 0.6448425650596619, + "epoch": 0.4040321090558213, + "grad_norm": 0.5102105736732483, "learning_rate": 0.0002, - "loss": 0.2844, + "loss": 0.3005, "step": 4920 }, { - "epoch": 0.5623061381292508, - "grad_norm": 0.49660468101501465, + "epoch": 0.4056745160032027, + "grad_norm": 0.49601197242736816, "learning_rate": 0.0002, - "loss": 0.2892, + "loss": 0.2983, "step": 4940 }, { - "epoch": 0.5645826811985999, - "grad_norm": 0.47702720761299133, + "epoch": 0.4073169229505841, + "grad_norm": 0.39463695883750916, "learning_rate": 0.0002, - "loss": 0.3111, + "loss": 0.3071, "step": 4960 }, { - "epoch": 0.5668592242679491, - "grad_norm": 0.5281921029090881, + "epoch": 0.40895932989796546, + "grad_norm": 0.5963265299797058, "learning_rate": 0.0002, - "loss": 0.2908, + "loss": 0.3017, "step": 4980 }, { - "epoch": 0.5691357673372983, - "grad_norm": 0.6427987813949585, + "epoch": 0.41060173684534684, + "grad_norm": 0.5571741461753845, "learning_rate": 0.0002, - "loss": 0.2848, + "loss": 0.312, "step": 5000 }, { - "epoch": 0.5714123104066475, - "grad_norm": 0.5437233448028564, + "epoch": 0.4122441437927282, + "grad_norm": 0.430397629737854, "learning_rate": 0.0002, - "loss": 0.3023, + "loss": 0.3077, "step": 5020 }, { - "epoch": 0.5736888534759967, - "grad_norm": 0.517444372177124, + "epoch": 0.4138865507401096, + "grad_norm": 0.5038132667541504, "learning_rate": 0.0002, - "loss": 0.2876, + "loss": 0.3065, "step": 5040 }, { - "epoch": 0.5759653965453458, - "grad_norm": 0.5197298526763916, + "epoch": 0.41552895768749104, + "grad_norm": 0.41420304775238037, "learning_rate": 0.0002, - "loss": 0.304, + "loss": 0.3061, "step": 5060 }, { - "epoch": 0.578241939614695, - "grad_norm": 0.3452152907848358, + "epoch": 0.4171713646348724, + "grad_norm": 0.6602872610092163, "learning_rate": 0.0002, - "loss": 0.2794, + "loss": 0.3101, "step": 5080 }, { - "epoch": 0.5805184826840443, - "grad_norm": 0.5630306601524353, + "epoch": 0.4188137715822538, + "grad_norm": 0.46677547693252563, "learning_rate": 0.0002, - "loss": 0.2979, + "loss": 0.3097, "step": 5100 }, { - "epoch": 0.5827950257533935, - "grad_norm": 0.5696737170219421, + "epoch": 0.4204561785296352, + "grad_norm": 0.5312944054603577, "learning_rate": 0.0002, - "loss": 0.3035, + "loss": 0.3136, "step": 5120 }, { - "epoch": 0.5850715688227427, - "grad_norm": 0.5024551153182983, + "epoch": 0.42209858547701656, + "grad_norm": 0.4542620778083801, "learning_rate": 0.0002, - "loss": 0.2717, + "loss": 0.3177, "step": 5140 }, { - "epoch": 0.5873481118920919, - "grad_norm": 0.4166383147239685, + "epoch": 0.42374099242439794, + "grad_norm": 0.5240755081176758, "learning_rate": 0.0002, - "loss": 0.3065, + "loss": 0.3121, "step": 5160 }, { - "epoch": 0.589624654961441, - "grad_norm": 0.36780408024787903, + "epoch": 0.4253833993717793, + "grad_norm": 0.49393558502197266, "learning_rate": 0.0002, - "loss": 0.2864, + "loss": 0.3145, "step": 5180 }, { - "epoch": 0.5919011980307902, - "grad_norm": 0.436526894569397, + "epoch": 0.4270258063191607, + "grad_norm": 0.3480128347873688, "learning_rate": 0.0002, - "loss": 0.2764, + "loss": 0.3047, "step": 5200 }, { - "epoch": 0.5941777411001394, - "grad_norm": 0.43115249276161194, + "epoch": 0.42866821326654214, + "grad_norm": 0.4269355833530426, "learning_rate": 0.0002, - "loss": 0.2791, + "loss": 0.3128, "step": 5220 }, { - "epoch": 0.5964542841694886, - "grad_norm": 0.359739750623703, + "epoch": 0.4303106202139235, + "grad_norm": 0.46620428562164307, "learning_rate": 0.0002, - "loss": 0.3108, + "loss": 0.2892, "step": 5240 }, { - "epoch": 0.5987308272388379, - "grad_norm": 0.4555259644985199, + "epoch": 0.4319530271613049, + "grad_norm": 0.502040684223175, "learning_rate": 0.0002, - "loss": 0.2623, + "loss": 0.2977, "step": 5260 }, { - "epoch": 0.6010073703081871, - "grad_norm": 0.4587076008319855, + "epoch": 0.4335954341086863, + "grad_norm": 0.4725840091705322, "learning_rate": 0.0002, - "loss": 0.293, + "loss": 0.2926, "step": 5280 }, { - "epoch": 0.6032839133775362, - "grad_norm": 0.5236973166465759, + "epoch": 0.43523784105606766, + "grad_norm": 0.4031844735145569, "learning_rate": 0.0002, - "loss": 0.2888, + "loss": 0.2931, "step": 5300 }, { - "epoch": 0.6055604564468854, - "grad_norm": 0.46685513854026794, + "epoch": 0.43688024800344905, + "grad_norm": 0.5044718384742737, "learning_rate": 0.0002, - "loss": 0.2731, + "loss": 0.2925, "step": 5320 }, { - "epoch": 0.6078369995162346, - "grad_norm": 0.5701884627342224, + "epoch": 0.4385226549508304, + "grad_norm": 0.43350791931152344, "learning_rate": 0.0002, - "loss": 0.28, + "loss": 0.3064, "step": 5340 }, { - "epoch": 0.6101135425855838, - "grad_norm": 0.5002717971801758, + "epoch": 0.4401650618982118, + "grad_norm": 0.4503776431083679, "learning_rate": 0.0002, - "loss": 0.2777, + "loss": 0.2935, "step": 5360 }, { - "epoch": 0.612390085654933, - "grad_norm": 0.5896885395050049, + "epoch": 0.44180746884559324, + "grad_norm": 0.4562300145626068, "learning_rate": 0.0002, - "loss": 0.3048, + "loss": 0.2908, "step": 5380 }, { - "epoch": 0.6146666287242821, - "grad_norm": 0.49014943838119507, + "epoch": 0.4434498757929746, + "grad_norm": 0.4543699026107788, "learning_rate": 0.0002, - "loss": 0.2642, + "loss": 0.2971, "step": 5400 }, { - "epoch": 0.6169431717936313, - "grad_norm": 0.5924846529960632, + "epoch": 0.445092282740356, + "grad_norm": 0.45582354068756104, "learning_rate": 0.0002, - "loss": 0.2943, + "loss": 0.3039, "step": 5420 }, { - "epoch": 0.6192197148629806, - "grad_norm": 0.49827829003334045, + "epoch": 0.4467346896877374, + "grad_norm": 0.535355269908905, "learning_rate": 0.0002, - "loss": 0.2879, + "loss": 0.3023, "step": 5440 }, { - "epoch": 0.6214962579323298, - "grad_norm": 0.45312178134918213, + "epoch": 0.44837709663511877, + "grad_norm": 0.6104617118835449, "learning_rate": 0.0002, - "loss": 0.2728, + "loss": 0.3001, "step": 5460 }, { - "epoch": 0.623772801001679, - "grad_norm": 0.3595191538333893, + "epoch": 0.45001950358250015, + "grad_norm": 0.5111253261566162, "learning_rate": 0.0002, - "loss": 0.2713, + "loss": 0.281, "step": 5480 }, { - "epoch": 0.6260493440710282, - "grad_norm": 0.6547619104385376, + "epoch": 0.4516619105298815, + "grad_norm": 0.49691838026046753, "learning_rate": 0.0002, - "loss": 0.2855, + "loss": 0.3043, "step": 5500 }, { - "epoch": 0.6283258871403773, - "grad_norm": 0.4659534692764282, + "epoch": 0.4533043174772629, + "grad_norm": 0.5030774474143982, "learning_rate": 0.0002, - "loss": 0.2908, + "loss": 0.2963, "step": 5520 }, { - "epoch": 0.6306024302097265, - "grad_norm": 0.4027460813522339, + "epoch": 0.4549467244246443, + "grad_norm": 0.4874095320701599, "learning_rate": 0.0002, - "loss": 0.2651, + "loss": 0.3063, "step": 5540 }, { - "epoch": 0.6328789732790757, - "grad_norm": 0.36129653453826904, + "epoch": 0.4565891313720257, + "grad_norm": 0.4713788330554962, "learning_rate": 0.0002, - "loss": 0.2915, + "loss": 0.2997, "step": 5560 }, { - "epoch": 0.6351555163484249, - "grad_norm": 0.5963912010192871, + "epoch": 0.4582315383194071, + "grad_norm": 0.48497167229652405, "learning_rate": 0.0002, - "loss": 0.2968, + "loss": 0.2936, "step": 5580 }, { - "epoch": 0.6374320594177741, - "grad_norm": 0.49669450521469116, + "epoch": 0.4598739452667885, + "grad_norm": 0.5291727185249329, "learning_rate": 0.0002, - "loss": 0.2965, + "loss": 0.2863, "step": 5600 }, { - "epoch": 0.6397086024871234, - "grad_norm": 0.5784302353858948, + "epoch": 0.46151635221416987, + "grad_norm": 0.5845544934272766, "learning_rate": 0.0002, - "loss": 0.2626, + "loss": 0.2834, "step": 5620 }, { - "epoch": 0.6419851455564725, - "grad_norm": 0.5651645660400391, + "epoch": 0.46315875916155125, + "grad_norm": 0.5052700638771057, "learning_rate": 0.0002, - "loss": 0.2738, + "loss": 0.281, "step": 5640 }, { - "epoch": 0.6442616886258217, - "grad_norm": 0.45475292205810547, + "epoch": 0.46480116610893263, + "grad_norm": 0.47813382744789124, "learning_rate": 0.0002, - "loss": 0.2653, + "loss": 0.2859, "step": 5660 }, { - "epoch": 0.6465382316951709, - "grad_norm": 0.4691898822784424, + "epoch": 0.466443573056314, + "grad_norm": 0.4913572072982788, "learning_rate": 0.0002, - "loss": 0.2634, + "loss": 0.2765, "step": 5680 }, { - "epoch": 0.6488147747645201, - "grad_norm": 0.4604431092739105, + "epoch": 0.4680859800036954, + "grad_norm": 0.5044130086898804, "learning_rate": 0.0002, - "loss": 0.2838, + "loss": 0.3068, "step": 5700 }, { - "epoch": 0.6510913178338693, - "grad_norm": 0.506804883480072, + "epoch": 0.4697283869510768, + "grad_norm": 0.45967990159988403, "learning_rate": 0.0002, - "loss": 0.2657, + "loss": 0.294, "step": 5720 }, { - "epoch": 0.6533678609032184, - "grad_norm": 0.5051881670951843, + "epoch": 0.4713707938984582, + "grad_norm": 0.4834402799606323, "learning_rate": 0.0002, - "loss": 0.2976, + "loss": 0.2902, "step": 5740 }, { - "epoch": 0.6556444039725676, - "grad_norm": 0.4780672788619995, + "epoch": 0.4730132008458396, + "grad_norm": 0.4889473617076874, "learning_rate": 0.0002, - "loss": 0.2828, + "loss": 0.2931, "step": 5760 }, { - "epoch": 0.6579209470419168, - "grad_norm": 0.4695095121860504, + "epoch": 0.47465560779322097, + "grad_norm": 0.37159985303878784, "learning_rate": 0.0002, - "loss": 0.2685, + "loss": 0.2836, "step": 5780 }, { - "epoch": 0.660197490111266, - "grad_norm": 0.4259052276611328, + "epoch": 0.47629801474060235, + "grad_norm": 0.44428759813308716, "learning_rate": 0.0002, - "loss": 0.2635, + "loss": 0.2994, "step": 5800 }, { - "epoch": 0.6624740331806153, - "grad_norm": 0.5684182643890381, + "epoch": 0.47794042168798373, + "grad_norm": 0.5093443989753723, "learning_rate": 0.0002, - "loss": 0.2879, + "loss": 0.2943, "step": 5820 }, { - "epoch": 0.6647505762499645, - "grad_norm": 0.42193594574928284, + "epoch": 0.4795828286353651, + "grad_norm": 0.539089024066925, "learning_rate": 0.0002, - "loss": 0.2678, + "loss": 0.2968, "step": 5840 }, { - "epoch": 0.6670271193193136, - "grad_norm": 0.5095034241676331, + "epoch": 0.4812252355827465, + "grad_norm": 0.33726248145103455, "learning_rate": 0.0002, - "loss": 0.2677, + "loss": 0.283, "step": 5860 }, { - "epoch": 0.6693036623886628, - "grad_norm": 0.46626052260398865, + "epoch": 0.4828676425301279, + "grad_norm": 0.451824426651001, "learning_rate": 0.0002, - "loss": 0.2906, + "loss": 0.2824, "step": 5880 }, { - "epoch": 0.671580205458012, - "grad_norm": 0.5086765289306641, + "epoch": 0.4845100494775093, + "grad_norm": 0.4333132207393646, "learning_rate": 0.0002, - "loss": 0.2775, + "loss": 0.2908, "step": 5900 }, { - "epoch": 0.6738567485273612, - "grad_norm": 0.44444966316223145, + "epoch": 0.4861524564248907, + "grad_norm": 0.4399010241031647, "learning_rate": 0.0002, - "loss": 0.2764, + "loss": 0.2857, "step": 5920 }, { - "epoch": 0.6761332915967104, - "grad_norm": 0.4477381706237793, + "epoch": 0.48779486337227207, + "grad_norm": 0.46633288264274597, "learning_rate": 0.0002, - "loss": 0.2729, + "loss": 0.2796, "step": 5940 }, { - "epoch": 0.6784098346660596, - "grad_norm": 0.46984028816223145, + "epoch": 0.48943727031965345, + "grad_norm": 0.6088176965713501, "learning_rate": 0.0002, - "loss": 0.273, + "loss": 0.2868, "step": 5960 }, { - "epoch": 0.6806863777354087, - "grad_norm": 0.417084276676178, + "epoch": 0.49107967726703483, + "grad_norm": 0.5191177129745483, "learning_rate": 0.0002, - "loss": 0.2744, + "loss": 0.2713, "step": 5980 }, { - "epoch": 0.682962920804758, - "grad_norm": 0.4144213795661926, + "epoch": 0.4927220842144162, + "grad_norm": 0.6080117225646973, "learning_rate": 0.0002, - "loss": 0.2704, + "loss": 0.2925, "step": 6000 }, { - "epoch": 0.6852394638741072, - "grad_norm": 0.5844799876213074, + "epoch": 0.4943644911617976, + "grad_norm": 0.4405871629714966, "learning_rate": 0.0002, - "loss": 0.2635, + "loss": 0.2827, "step": 6020 }, { - "epoch": 0.6875160069434564, - "grad_norm": 0.39512693881988525, + "epoch": 0.49600689810917903, + "grad_norm": 0.44443821907043457, "learning_rate": 0.0002, - "loss": 0.2471, + "loss": 0.2641, "step": 6040 }, { - "epoch": 0.6897925500128056, - "grad_norm": 0.5299990773200989, + "epoch": 0.4976493050565604, + "grad_norm": 0.401265025138855, "learning_rate": 0.0002, - "loss": 0.2648, + "loss": 0.2908, "step": 6060 }, { - "epoch": 0.6920690930821547, - "grad_norm": 0.4980265498161316, + "epoch": 0.4992917120039418, + "grad_norm": 0.4125641882419586, "learning_rate": 0.0002, - "loss": 0.2725, + "loss": 0.2717, "step": 6080 }, { - "epoch": 0.6943456361515039, - "grad_norm": 0.4003869891166687, + "epoch": 0.5009341189513231, + "grad_norm": 0.4346245229244232, "learning_rate": 0.0002, - "loss": 0.2768, + "loss": 0.2706, "step": 6100 }, { - "epoch": 0.6966221792208531, - "grad_norm": 0.5103460550308228, + "epoch": 0.5025765258987046, + "grad_norm": 0.47208690643310547, "learning_rate": 0.0002, - "loss": 0.2638, + "loss": 0.2851, "step": 6120 }, { - "epoch": 0.6988987222902023, - "grad_norm": 0.737101137638092, + "epoch": 0.504218932846086, + "grad_norm": 0.4369046986103058, "learning_rate": 0.0002, - "loss": 0.2779, + "loss": 0.2809, "step": 6140 }, { - "epoch": 0.7011752653595515, - "grad_norm": 0.4731826186180115, + "epoch": 0.5058613397934674, + "grad_norm": 0.5451960563659668, "learning_rate": 0.0002, - "loss": 0.2691, + "loss": 0.293, "step": 6160 }, { - "epoch": 0.7034518084289008, - "grad_norm": 0.5234053730964661, + "epoch": 0.5075037467408487, + "grad_norm": 0.6085506677627563, "learning_rate": 0.0002, - "loss": 0.2739, + "loss": 0.2748, "step": 6180 }, { - "epoch": 0.7057283514982499, - "grad_norm": 0.5235525369644165, + "epoch": 0.5091461536882301, + "grad_norm": 0.3898778259754181, "learning_rate": 0.0002, - "loss": 0.2754, + "loss": 0.276, "step": 6200 }, { - "epoch": 0.7080048945675991, - "grad_norm": 0.4453619122505188, + "epoch": 0.5107885606356115, + "grad_norm": 0.5069212317466736, "learning_rate": 0.0002, - "loss": 0.2833, + "loss": 0.2925, "step": 6220 }, { - "epoch": 0.7102814376369483, - "grad_norm": 0.4025666117668152, + "epoch": 0.5124309675829929, + "grad_norm": 0.48736870288848877, "learning_rate": 0.0002, - "loss": 0.2713, + "loss": 0.2718, "step": 6240 }, { - "epoch": 0.7125579807062975, - "grad_norm": 0.35240331292152405, + "epoch": 0.5140733745303743, + "grad_norm": 0.5182287693023682, "learning_rate": 0.0002, - "loss": 0.2786, + "loss": 0.2783, "step": 6260 }, { - "epoch": 0.7148345237756467, - "grad_norm": 0.4521905779838562, + "epoch": 0.5157157814777557, + "grad_norm": 0.5157051086425781, "learning_rate": 0.0002, - "loss": 0.2639, + "loss": 0.2828, "step": 6280 }, { - "epoch": 0.7171110668449959, - "grad_norm": 0.5230519771575928, + "epoch": 0.517358188425137, + "grad_norm": 0.4653798043727875, "learning_rate": 0.0002, - "loss": 0.2517, + "loss": 0.2802, "step": 6300 }, { - "epoch": 0.719387609914345, - "grad_norm": 0.5415637493133545, + "epoch": 0.5190005953725184, + "grad_norm": 0.4838721454143524, "learning_rate": 0.0002, - "loss": 0.2739, + "loss": 0.2758, "step": 6320 }, { - "epoch": 0.7216641529836942, - "grad_norm": 0.4067966341972351, + "epoch": 0.5206430023198998, + "grad_norm": 0.47830331325531006, "learning_rate": 0.0002, - "loss": 0.2751, + "loss": 0.2999, "step": 6340 }, { - "epoch": 0.7239406960530435, - "grad_norm": 0.4670214354991913, + "epoch": 0.5222854092672812, + "grad_norm": 0.45021089911460876, "learning_rate": 0.0002, - "loss": 0.2644, + "loss": 0.2673, "step": 6360 }, { - "epoch": 0.7262172391223927, - "grad_norm": 0.5316203236579895, + "epoch": 0.5239278162146626, + "grad_norm": 0.4527071714401245, "learning_rate": 0.0002, - "loss": 0.2746, + "loss": 0.2624, "step": 6380 }, { - "epoch": 0.7284937821917419, - "grad_norm": 0.46312493085861206, + "epoch": 0.5255702231620439, + "grad_norm": 0.508590817451477, "learning_rate": 0.0002, - "loss": 0.2539, + "loss": 0.2555, "step": 6400 }, { - "epoch": 0.730770325261091, - "grad_norm": 0.465279221534729, + "epoch": 0.5272126301094253, + "grad_norm": 0.38745129108428955, "learning_rate": 0.0002, - "loss": 0.2742, + "loss": 0.2863, "step": 6420 }, { - "epoch": 0.7330468683304402, - "grad_norm": 0.5096962451934814, + "epoch": 0.5288550370568067, + "grad_norm": 0.6669766902923584, "learning_rate": 0.0002, - "loss": 0.2546, + "loss": 0.2813, "step": 6440 }, { - "epoch": 0.7353234113997894, - "grad_norm": 0.4525590240955353, + "epoch": 0.5304974440041882, + "grad_norm": 0.5111877918243408, "learning_rate": 0.0002, - "loss": 0.2694, + "loss": 0.2712, "step": 6460 }, { - "epoch": 0.7375999544691386, - "grad_norm": 0.5033881664276123, + "epoch": 0.5321398509515696, + "grad_norm": 0.5499460697174072, "learning_rate": 0.0002, - "loss": 0.2627, + "loss": 0.2656, "step": 6480 }, { - "epoch": 0.7398764975384878, - "grad_norm": 0.44053900241851807, + "epoch": 0.533782257898951, + "grad_norm": 0.5004873275756836, "learning_rate": 0.0002, - "loss": 0.258, + "loss": 0.2873, "step": 6500 }, { - "epoch": 0.742153040607837, - "grad_norm": 0.4677462875843048, + "epoch": 0.5354246648463323, + "grad_norm": 0.6010814309120178, "learning_rate": 0.0002, - "loss": 0.2659, + "loss": 0.3005, "step": 6520 }, { - "epoch": 0.7444295836771861, - "grad_norm": 0.5687553882598877, + "epoch": 0.5370670717937137, + "grad_norm": 0.4720690846443176, "learning_rate": 0.0002, - "loss": 0.271, + "loss": 0.2675, "step": 6540 }, { - "epoch": 0.7467061267465354, - "grad_norm": 0.4980468451976776, + "epoch": 0.5387094787410951, + "grad_norm": 0.47902727127075195, "learning_rate": 0.0002, - "loss": 0.265, + "loss": 0.2715, "step": 6560 }, { - "epoch": 0.7489826698158846, - "grad_norm": 0.5155619382858276, + "epoch": 0.5403518856884765, + "grad_norm": 0.46664199233055115, "learning_rate": 0.0002, - "loss": 0.2491, + "loss": 0.2713, "step": 6580 }, { - "epoch": 0.7512592128852338, - "grad_norm": 0.5364673733711243, + "epoch": 0.5419942926358579, + "grad_norm": 0.5385149121284485, "learning_rate": 0.0002, - "loss": 0.2564, + "loss": 0.2867, "step": 6600 }, { - "epoch": 0.753535755954583, - "grad_norm": 0.421838641166687, + "epoch": 0.5436366995832392, + "grad_norm": 0.3878926932811737, "learning_rate": 0.0002, - "loss": 0.267, + "loss": 0.2802, "step": 6620 }, { - "epoch": 0.7558122990239322, - "grad_norm": 0.46299833059310913, + "epoch": 0.5452791065306206, + "grad_norm": 0.390656054019928, "learning_rate": 0.0002, - "loss": 0.2461, + "loss": 0.2676, "step": 6640 }, { - "epoch": 0.7580888420932813, - "grad_norm": 0.3832832872867584, + "epoch": 0.546921513478002, + "grad_norm": 0.4342198669910431, "learning_rate": 0.0002, - "loss": 0.265, + "loss": 0.2874, "step": 6660 }, { - "epoch": 0.7603653851626305, - "grad_norm": 0.5560947060585022, + "epoch": 0.5485639204253834, + "grad_norm": 0.42557764053344727, "learning_rate": 0.0002, - "loss": 0.253, + "loss": 0.2829, "step": 6680 }, { - "epoch": 0.7626419282319797, - "grad_norm": 0.4832628667354584, + "epoch": 0.5502063273727648, + "grad_norm": 0.5569108128547668, "learning_rate": 0.0002, - "loss": 0.2515, + "loss": 0.2929, "step": 6700 }, { - "epoch": 0.764918471301329, - "grad_norm": 0.44354599714279175, + "epoch": 0.5518487343201461, + "grad_norm": 0.38765788078308105, "learning_rate": 0.0002, - "loss": 0.2687, + "loss": 0.2804, "step": 6720 }, { - "epoch": 0.7671950143706782, - "grad_norm": 0.3746070861816406, + "epoch": 0.5534911412675275, + "grad_norm": 0.5068329572677612, "learning_rate": 0.0002, - "loss": 0.2481, + "loss": 0.2629, "step": 6740 }, { - "epoch": 0.7694715574400273, - "grad_norm": 0.3048388659954071, + "epoch": 0.5551335482149089, + "grad_norm": 0.5097832083702087, "learning_rate": 0.0002, - "loss": 0.269, + "loss": 0.2846, "step": 6760 }, { - "epoch": 0.7717481005093765, - "grad_norm": 0.46471843123435974, + "epoch": 0.5567759551622903, + "grad_norm": 0.37154141068458557, "learning_rate": 0.0002, - "loss": 0.2642, + "loss": 0.2625, "step": 6780 }, { - "epoch": 0.7740246435787257, - "grad_norm": 0.44309428334236145, + "epoch": 0.5584183621096718, + "grad_norm": 0.41640445590019226, "learning_rate": 0.0002, - "loss": 0.2565, + "loss": 0.2669, "step": 6800 }, { - "epoch": 0.7763011866480749, - "grad_norm": 0.4174291789531708, + "epoch": 0.5600607690570532, + "grad_norm": 0.45431575179100037, "learning_rate": 0.0002, - "loss": 0.262, + "loss": 0.2644, "step": 6820 }, { - "epoch": 0.7785777297174241, - "grad_norm": 0.42592549324035645, + "epoch": 0.5617031760044345, + "grad_norm": 0.46759283542633057, "learning_rate": 0.0002, - "loss": 0.2608, + "loss": 0.2742, "step": 6840 }, { - "epoch": 0.7808542727867733, - "grad_norm": 0.4378054141998291, + "epoch": 0.5633455829518159, + "grad_norm": 0.4959569275379181, "learning_rate": 0.0002, - "loss": 0.2765, + "loss": 0.2746, "step": 6860 }, { - "epoch": 0.7831308158561224, - "grad_norm": 0.4560708701610565, + "epoch": 0.5649879898991973, + "grad_norm": 0.44646400213241577, "learning_rate": 0.0002, - "loss": 0.2381, + "loss": 0.2803, "step": 6880 }, { - "epoch": 0.7854073589254716, - "grad_norm": 0.4595545828342438, + "epoch": 0.5666303968465787, + "grad_norm": 0.5323026180267334, "learning_rate": 0.0002, - "loss": 0.2561, + "loss": 0.2685, "step": 6900 }, { - "epoch": 0.7876839019948209, - "grad_norm": 0.45213592052459717, + "epoch": 0.5682728037939601, + "grad_norm": 0.5455038547515869, "learning_rate": 0.0002, - "loss": 0.2645, + "loss": 0.2737, "step": 6920 }, { - "epoch": 0.7899604450641701, - "grad_norm": 0.4857342839241028, + "epoch": 0.5699152107413414, + "grad_norm": 0.429975301027298, "learning_rate": 0.0002, - "loss": 0.2687, + "loss": 0.2826, "step": 6940 }, { - "epoch": 0.7922369881335193, - "grad_norm": 0.4939437508583069, + "epoch": 0.5715576176887228, + "grad_norm": 0.5396720170974731, "learning_rate": 0.0002, - "loss": 0.2642, + "loss": 0.266, "step": 6960 }, { - "epoch": 0.7945135312028685, - "grad_norm": 0.46244382858276367, + "epoch": 0.5732000246361042, + "grad_norm": 0.45468002557754517, "learning_rate": 0.0002, - "loss": 0.2536, + "loss": 0.2676, "step": 6980 }, { - "epoch": 0.7967900742722176, - "grad_norm": 0.5876993536949158, + "epoch": 0.5748424315834856, + "grad_norm": 0.4196678698062897, "learning_rate": 0.0002, - "loss": 0.2492, + "loss": 0.2786, "step": 7000 }, { - "epoch": 0.7990666173415668, - "grad_norm": 0.5170072913169861, + "epoch": 0.576484838530867, + "grad_norm": 0.4681088328361511, "learning_rate": 0.0002, - "loss": 0.2548, + "loss": 0.2731, "step": 7020 }, { - "epoch": 0.801343160410916, - "grad_norm": 0.394380658864975, + "epoch": 0.5781272454782483, + "grad_norm": 0.4538247287273407, "learning_rate": 0.0002, - "loss": 0.2524, + "loss": 0.287, "step": 7040 }, { - "epoch": 0.8036197034802652, - "grad_norm": 0.4716455340385437, + "epoch": 0.5797696524256297, + "grad_norm": 0.4834930896759033, "learning_rate": 0.0002, - "loss": 0.2573, + "loss": 0.2808, "step": 7060 }, { - "epoch": 0.8058962465496144, - "grad_norm": 0.34525179862976074, + "epoch": 0.5814120593730111, + "grad_norm": 0.5876035690307617, "learning_rate": 0.0002, - "loss": 0.246, + "loss": 0.2631, "step": 7080 }, { - "epoch": 0.8081727896189635, - "grad_norm": 0.5030418038368225, + "epoch": 0.5830544663203925, + "grad_norm": 0.5164270401000977, "learning_rate": 0.0002, - "loss": 0.2596, + "loss": 0.2502, "step": 7100 }, { - "epoch": 0.8104493326883128, - "grad_norm": 0.5586132407188416, + "epoch": 0.584696873267774, + "grad_norm": 0.46229973435401917, "learning_rate": 0.0002, - "loss": 0.2568, + "loss": 0.2575, "step": 7120 }, { - "epoch": 0.812725875757662, - "grad_norm": 0.47025129199028015, + "epoch": 0.5863392802151554, + "grad_norm": 0.438803106546402, "learning_rate": 0.0002, - "loss": 0.265, + "loss": 0.2625, "step": 7140 }, { - "epoch": 0.8150024188270112, - "grad_norm": 0.5654832720756531, + "epoch": 0.5879816871625367, + "grad_norm": 0.5476749539375305, "learning_rate": 0.0002, - "loss": 0.2468, + "loss": 0.2706, "step": 7160 }, { - "epoch": 0.8172789618963604, - "grad_norm": 0.4701017141342163, + "epoch": 0.5896240941099181, + "grad_norm": 0.5194425582885742, "learning_rate": 0.0002, - "loss": 0.2538, + "loss": 0.2766, "step": 7180 }, { - "epoch": 0.8195555049657096, - "grad_norm": 0.47270438075065613, + "epoch": 0.5912665010572995, + "grad_norm": 0.4764098525047302, "learning_rate": 0.0002, - "loss": 0.2529, + "loss": 0.2784, "step": 7200 }, { - "epoch": 0.8218320480350587, - "grad_norm": 0.39433714747428894, + "epoch": 0.5929089080046809, + "grad_norm": 0.4703931510448456, "learning_rate": 0.0002, - "loss": 0.2445, + "loss": 0.2652, "step": 7220 }, { - "epoch": 0.8241085911044079, - "grad_norm": 0.4521467685699463, + "epoch": 0.5945513149520623, + "grad_norm": 0.43372678756713867, "learning_rate": 0.0002, - "loss": 0.2556, + "loss": 0.2644, "step": 7240 }, { - "epoch": 0.8263851341737571, - "grad_norm": 0.28483667969703674, + "epoch": 0.5961937218994436, + "grad_norm": 0.40813469886779785, "learning_rate": 0.0002, - "loss": 0.2451, + "loss": 0.2721, "step": 7260 }, { - "epoch": 0.8286616772431064, - "grad_norm": 0.4298310875892639, + "epoch": 0.597836128846825, + "grad_norm": 0.5182124376296997, "learning_rate": 0.0002, - "loss": 0.2599, + "loss": 0.2741, "step": 7280 }, { - "epoch": 0.8309382203124556, - "grad_norm": 0.39677906036376953, + "epoch": 0.5994785357942064, + "grad_norm": 0.4767136573791504, "learning_rate": 0.0002, - "loss": 0.2539, + "loss": 0.277, "step": 7300 }, { - "epoch": 0.8332147633818048, - "grad_norm": 0.5800175666809082, + "epoch": 0.6011209427415878, + "grad_norm": 0.43762916326522827, "learning_rate": 0.0002, - "loss": 0.2463, + "loss": 0.2645, "step": 7320 }, { - "epoch": 0.8354913064511539, - "grad_norm": 0.42742472887039185, + "epoch": 0.6027633496889692, + "grad_norm": 0.44736623764038086, "learning_rate": 0.0002, - "loss": 0.2593, + "loss": 0.2639, "step": 7340 }, { - "epoch": 0.8377678495205031, - "grad_norm": 0.5521807670593262, + "epoch": 0.6044057566363505, + "grad_norm": 0.44404810667037964, "learning_rate": 0.0002, - "loss": 0.253, + "loss": 0.269, "step": 7360 }, { - "epoch": 0.8400443925898523, - "grad_norm": 0.5068047046661377, + "epoch": 0.6060481635837319, + "grad_norm": 0.4380868673324585, "learning_rate": 0.0002, - "loss": 0.2503, + "loss": 0.2615, "step": 7380 }, { - "epoch": 0.8423209356592015, - "grad_norm": 0.4325120151042938, + "epoch": 0.6076905705311133, + "grad_norm": 0.4491208791732788, "learning_rate": 0.0002, - "loss": 0.2466, + "loss": 0.2462, "step": 7400 }, { - "epoch": 0.8445974787285507, - "grad_norm": 0.5130394101142883, + "epoch": 0.6093329774784947, + "grad_norm": 0.5080710053443909, "learning_rate": 0.0002, - "loss": 0.2521, + "loss": 0.2823, "step": 7420 }, { - "epoch": 0.8468740217978998, - "grad_norm": 0.5091120600700378, + "epoch": 0.6109753844258761, + "grad_norm": 0.47498422861099243, "learning_rate": 0.0002, - "loss": 0.2429, + "loss": 0.2706, "step": 7440 }, { - "epoch": 0.849150564867249, - "grad_norm": 0.4635036289691925, + "epoch": 0.6126177913732576, + "grad_norm": 0.4133289158344269, "learning_rate": 0.0002, - "loss": 0.235, + "loss": 0.2684, "step": 7460 }, { - "epoch": 0.8514271079365983, - "grad_norm": 0.3827108144760132, + "epoch": 0.6142601983206389, + "grad_norm": 0.4456469416618347, "learning_rate": 0.0002, - "loss": 0.2487, + "loss": 0.2542, "step": 7480 }, { - "epoch": 0.8537036510059475, - "grad_norm": 0.3880899250507355, + "epoch": 0.6159026052680203, + "grad_norm": 0.5421611070632935, "learning_rate": 0.0002, - "loss": 0.2469, + "loss": 0.2737, "step": 7500 }, { - "epoch": 0.8559801940752967, - "grad_norm": 0.408933162689209, + "epoch": 0.6175450122154017, + "grad_norm": 0.4131532609462738, "learning_rate": 0.0002, - "loss": 0.2499, + "loss": 0.2507, "step": 7520 }, { - "epoch": 0.8582567371446459, - "grad_norm": 0.5049706101417542, + "epoch": 0.6191874191627831, + "grad_norm": 0.47127702832221985, "learning_rate": 0.0002, - "loss": 0.2418, + "loss": 0.2819, "step": 7540 }, { - "epoch": 0.860533280213995, - "grad_norm": 0.43551701307296753, + "epoch": 0.6208298261101645, + "grad_norm": 0.43743231892585754, "learning_rate": 0.0002, - "loss": 0.2478, + "loss": 0.2822, "step": 7560 }, { - "epoch": 0.8628098232833442, - "grad_norm": 0.5024411678314209, + "epoch": 0.6224722330575458, + "grad_norm": 0.42425501346588135, "learning_rate": 0.0002, - "loss": 0.2538, + "loss": 0.2654, "step": 7580 }, { - "epoch": 0.8650863663526934, - "grad_norm": 0.36361223459243774, + "epoch": 0.6241146400049272, + "grad_norm": 0.4609832763671875, "learning_rate": 0.0002, - "loss": 0.2536, + "loss": 0.2466, "step": 7600 }, { - "epoch": 0.8673629094220426, - "grad_norm": 0.4526277482509613, + "epoch": 0.6257570469523086, + "grad_norm": 0.42701244354248047, "learning_rate": 0.0002, - "loss": 0.242, + "loss": 0.255, "step": 7620 }, { - "epoch": 0.8696394524913919, - "grad_norm": 0.5677676200866699, + "epoch": 0.62739945389969, + "grad_norm": 0.5154401063919067, "learning_rate": 0.0002, - "loss": 0.2572, + "loss": 0.2705, "step": 7640 }, { - "epoch": 0.8719159955607411, - "grad_norm": 0.4915711283683777, + "epoch": 0.6290418608470714, + "grad_norm": 0.451377809047699, "learning_rate": 0.0002, - "loss": 0.2562, + "loss": 0.2586, "step": 7660 }, { - "epoch": 0.8741925386300902, - "grad_norm": 0.36850452423095703, + "epoch": 0.6306842677944527, + "grad_norm": 0.47166112065315247, "learning_rate": 0.0002, - "loss": 0.2523, + "loss": 0.2605, "step": 7680 }, { - "epoch": 0.8764690816994394, - "grad_norm": 0.38313761353492737, + "epoch": 0.6323266747418341, + "grad_norm": 0.3716096878051758, "learning_rate": 0.0002, - "loss": 0.2596, + "loss": 0.2539, "step": 7700 }, { - "epoch": 0.8787456247687886, - "grad_norm": 0.5384640097618103, + "epoch": 0.6339690816892155, + "grad_norm": 0.45413604378700256, "learning_rate": 0.0002, - "loss": 0.2455, + "loss": 0.2633, "step": 7720 }, { - "epoch": 0.8810221678381378, - "grad_norm": 0.5308900475502014, + "epoch": 0.6356114886365969, + "grad_norm": 0.48580700159072876, "learning_rate": 0.0002, - "loss": 0.2439, + "loss": 0.256, "step": 7740 }, { - "epoch": 0.883298710907487, - "grad_norm": 0.5488154292106628, + "epoch": 0.6372538955839783, + "grad_norm": 0.40647098422050476, "learning_rate": 0.0002, - "loss": 0.2428, + "loss": 0.2655, "step": 7760 }, { - "epoch": 0.8855752539768362, - "grad_norm": 0.5271242260932922, + "epoch": 0.6388963025313598, + "grad_norm": 0.4718053638935089, "learning_rate": 0.0002, - "loss": 0.2372, + "loss": 0.261, "step": 7780 }, { - "epoch": 0.8878517970461853, - "grad_norm": 0.46171802282333374, + "epoch": 0.6405387094787411, + "grad_norm": 0.5230545401573181, "learning_rate": 0.0002, - "loss": 0.2506, + "loss": 0.2464, "step": 7800 }, { - "epoch": 0.8901283401155345, - "grad_norm": 0.45436665415763855, + "epoch": 0.6421811164261225, + "grad_norm": 0.5010546445846558, "learning_rate": 0.0002, - "loss": 0.2414, + "loss": 0.261, "step": 7820 }, { - "epoch": 0.8924048831848838, - "grad_norm": 0.4920847415924072, + "epoch": 0.6438235233735039, + "grad_norm": 0.41263461112976074, "learning_rate": 0.0002, - "loss": 0.2669, + "loss": 0.2626, "step": 7840 }, { - "epoch": 0.894681426254233, - "grad_norm": 0.5913518071174622, + "epoch": 0.6454659303208853, + "grad_norm": 0.538346529006958, "learning_rate": 0.0002, - "loss": 0.2552, + "loss": 0.2557, "step": 7860 }, { - "epoch": 0.8969579693235822, - "grad_norm": 0.6011972427368164, + "epoch": 0.6471083372682667, + "grad_norm": 0.4800877869129181, "learning_rate": 0.0002, - "loss": 0.2533, + "loss": 0.2742, "step": 7880 }, { - "epoch": 0.8992345123929313, - "grad_norm": 0.4650927186012268, + "epoch": 0.648750744215648, + "grad_norm": 0.5247358083724976, "learning_rate": 0.0002, - "loss": 0.2448, + "loss": 0.2608, "step": 7900 }, { - "epoch": 0.9015110554622805, - "grad_norm": 0.5828790664672852, + "epoch": 0.6503931511630294, + "grad_norm": 0.5625537037849426, "learning_rate": 0.0002, - "loss": 0.2381, + "loss": 0.2445, "step": 7920 }, { - "epoch": 0.9037875985316297, - "grad_norm": 0.5178338885307312, + "epoch": 0.6520355581104108, + "grad_norm": 0.44077080488204956, "learning_rate": 0.0002, - "loss": 0.2619, + "loss": 0.2572, "step": 7940 }, { - "epoch": 0.9060641416009789, - "grad_norm": 0.5147708058357239, + "epoch": 0.6536779650577922, + "grad_norm": 0.4610736072063446, "learning_rate": 0.0002, - "loss": 0.258, + "loss": 0.2645, "step": 7960 }, { - "epoch": 0.9083406846703281, - "grad_norm": 0.45790836215019226, + "epoch": 0.6553203720051736, + "grad_norm": 0.4790017008781433, "learning_rate": 0.0002, - "loss": 0.2474, + "loss": 0.2556, "step": 7980 }, { - "epoch": 0.9106172277396773, - "grad_norm": 0.3837074935436249, + "epoch": 0.656962778952555, + "grad_norm": 0.45367711782455444, "learning_rate": 0.0002, - "loss": 0.2356, + "loss": 0.253, "step": 8000 }, { - "epoch": 0.9128937708090265, - "grad_norm": 0.4466090500354767, + "epoch": 0.6586051858999363, + "grad_norm": 0.4644503593444824, "learning_rate": 0.0002, - "loss": 0.237, + "loss": 0.25, "step": 8020 }, { - "epoch": 0.9151703138783757, - "grad_norm": 0.5893344283103943, + "epoch": 0.6602475928473177, + "grad_norm": 0.3938300311565399, "learning_rate": 0.0002, - "loss": 0.2399, + "loss": 0.2524, "step": 8040 }, { - "epoch": 0.9174468569477249, - "grad_norm": 0.49547362327575684, + "epoch": 0.6618899997946991, + "grad_norm": 0.4796749949455261, "learning_rate": 0.0002, - "loss": 0.2526, + "loss": 0.2643, "step": 8060 }, { - "epoch": 0.9197234000170741, - "grad_norm": 0.47068551182746887, + "epoch": 0.6635324067420805, + "grad_norm": 0.3965921700000763, "learning_rate": 0.0002, - "loss": 0.2631, + "loss": 0.252, "step": 8080 }, { - "epoch": 0.9219999430864233, - "grad_norm": 0.3512951135635376, + "epoch": 0.6651748136894619, + "grad_norm": 0.4033324420452118, "learning_rate": 0.0002, - "loss": 0.2395, + "loss": 0.2469, "step": 8100 }, { - "epoch": 0.9242764861557725, - "grad_norm": 0.3996793031692505, + "epoch": 0.6668172206368433, + "grad_norm": 0.5205174088478088, "learning_rate": 0.0002, - "loss": 0.2424, + "loss": 0.2479, "step": 8120 }, { - "epoch": 0.9265530292251216, - "grad_norm": 0.5782022476196289, + "epoch": 0.6684596275842247, + "grad_norm": 0.4026409685611725, "learning_rate": 0.0002, - "loss": 0.2549, + "loss": 0.2482, "step": 8140 }, { - "epoch": 0.9288295722944708, - "grad_norm": 0.450860857963562, + "epoch": 0.6701020345316061, + "grad_norm": 0.33538395166397095, "learning_rate": 0.0002, - "loss": 0.2465, + "loss": 0.2452, "step": 8160 }, { - "epoch": 0.93110611536382, - "grad_norm": 0.4679816663265228, + "epoch": 0.6717444414789875, + "grad_norm": 0.43549609184265137, "learning_rate": 0.0002, - "loss": 0.2326, + "loss": 0.2548, "step": 8180 }, { - "epoch": 0.9333826584331693, - "grad_norm": 0.5497337579727173, + "epoch": 0.6733868484263689, + "grad_norm": 0.5167241096496582, "learning_rate": 0.0002, - "loss": 0.2457, + "loss": 0.2664, "step": 8200 }, { - "epoch": 0.9356592015025185, - "grad_norm": 0.3775748312473297, + "epoch": 0.6750292553737502, + "grad_norm": 0.4824913740158081, "learning_rate": 0.0002, - "loss": 0.2331, + "loss": 0.2668, "step": 8220 }, { - "epoch": 0.9379357445718676, - "grad_norm": 0.5428327918052673, + "epoch": 0.6766716623211316, + "grad_norm": 0.49560844898223877, "learning_rate": 0.0002, - "loss": 0.2399, + "loss": 0.2639, "step": 8240 }, { - "epoch": 0.9402122876412168, - "grad_norm": 0.4089830219745636, + "epoch": 0.678314069268513, + "grad_norm": 0.43627840280532837, "learning_rate": 0.0002, - "loss": 0.246, + "loss": 0.2536, "step": 8260 }, { - "epoch": 0.942488830710566, - "grad_norm": 0.5781340003013611, + "epoch": 0.6799564762158944, + "grad_norm": 0.4371199905872345, "learning_rate": 0.0002, - "loss": 0.2451, + "loss": 0.259, "step": 8280 }, { - "epoch": 0.9447653737799152, - "grad_norm": 0.5869989395141602, + "epoch": 0.6815988831632758, + "grad_norm": 0.43210867047309875, "learning_rate": 0.0002, - "loss": 0.2541, + "loss": 0.2413, "step": 8300 }, { - "epoch": 0.9470419168492644, - "grad_norm": 0.47708019614219666, + "epoch": 0.6832412901106572, + "grad_norm": 0.4612789750099182, "learning_rate": 0.0002, - "loss": 0.2559, + "loss": 0.257, "step": 8320 }, { - "epoch": 0.9493184599186136, - "grad_norm": 0.5445525050163269, + "epoch": 0.6848836970580385, + "grad_norm": 0.5780384540557861, "learning_rate": 0.0002, - "loss": 0.2466, + "loss": 0.2497, "step": 8340 }, { - "epoch": 0.9515950029879627, - "grad_norm": 0.480214387178421, + "epoch": 0.6865261040054199, + "grad_norm": 0.3581444323062897, "learning_rate": 0.0002, - "loss": 0.236, + "loss": 0.2542, "step": 8360 }, { - "epoch": 0.953871546057312, - "grad_norm": 0.5392053127288818, + "epoch": 0.6881685109528013, + "grad_norm": 0.5276636481285095, "learning_rate": 0.0002, - "loss": 0.2383, + "loss": 0.2482, "step": 8380 }, { - "epoch": 0.9561480891266612, - "grad_norm": 0.4515858292579651, + "epoch": 0.6898109179001827, + "grad_norm": 0.419548362493515, "learning_rate": 0.0002, - "loss": 0.238, + "loss": 0.2778, "step": 8400 }, { - "epoch": 0.9584246321960104, - "grad_norm": 0.5461826324462891, + "epoch": 0.691453324847564, + "grad_norm": 0.5594448447227478, "learning_rate": 0.0002, - "loss": 0.2442, + "loss": 0.271, "step": 8420 }, { - "epoch": 0.9607011752653596, - "grad_norm": 0.44309332966804504, + "epoch": 0.6930957317949455, + "grad_norm": 0.4505052864551544, "learning_rate": 0.0002, - "loss": 0.2622, + "loss": 0.2531, "step": 8440 }, { - "epoch": 0.9629777183347088, - "grad_norm": 0.5409505367279053, + "epoch": 0.6947381387423269, + "grad_norm": 0.4273683726787567, "learning_rate": 0.0002, - "loss": 0.2303, + "loss": 0.2687, "step": 8460 }, { - "epoch": 0.9652542614040579, - "grad_norm": 0.3868342638015747, + "epoch": 0.6963805456897083, + "grad_norm": 0.41312068700790405, "learning_rate": 0.0002, - "loss": 0.2624, + "loss": 0.2535, "step": 8480 }, { - "epoch": 0.9675308044734071, - "grad_norm": 0.38888975977897644, + "epoch": 0.6980229526370897, + "grad_norm": 0.3998921811580658, "learning_rate": 0.0002, - "loss": 0.246, + "loss": 0.2507, "step": 8500 }, { - "epoch": 0.9698073475427563, - "grad_norm": 0.38946032524108887, + "epoch": 0.6996653595844711, + "grad_norm": 0.4063471257686615, "learning_rate": 0.0002, - "loss": 0.2503, + "loss": 0.2604, "step": 8520 }, { - "epoch": 0.9720838906121055, - "grad_norm": 0.42425817251205444, + "epoch": 0.7013077665318525, + "grad_norm": 0.4816170036792755, "learning_rate": 0.0002, - "loss": 0.2556, + "loss": 0.2563, "step": 8540 }, { - "epoch": 0.9743604336814548, - "grad_norm": 0.41515296697616577, + "epoch": 0.7029501734792338, + "grad_norm": 0.47880151867866516, "learning_rate": 0.0002, - "loss": 0.2437, + "loss": 0.2582, "step": 8560 }, { - "epoch": 0.9766369767508039, - "grad_norm": 0.4085826575756073, + "epoch": 0.7045925804266152, + "grad_norm": 0.43934714794158936, "learning_rate": 0.0002, - "loss": 0.2293, + "loss": 0.2588, "step": 8580 }, { - "epoch": 0.9789135198201531, - "grad_norm": 0.3404542803764343, + "epoch": 0.7062349873739966, + "grad_norm": 0.5664840340614319, "learning_rate": 0.0002, - "loss": 0.242, + "loss": 0.2361, "step": 8600 }, { - "epoch": 0.9811900628895023, - "grad_norm": 0.43266579508781433, + "epoch": 0.707877394321378, + "grad_norm": 0.4387499690055847, "learning_rate": 0.0002, - "loss": 0.2513, + "loss": 0.2784, "step": 8620 }, { - "epoch": 0.9834666059588515, - "grad_norm": 0.42724549770355225, + "epoch": 0.7095198012687594, + "grad_norm": 0.4497361183166504, "learning_rate": 0.0002, - "loss": 0.2384, + "loss": 0.2419, "step": 8640 }, { - "epoch": 0.9857431490282007, - "grad_norm": 0.5089221596717834, + "epoch": 0.7111622082161407, + "grad_norm": 0.36037716269493103, "learning_rate": 0.0002, - "loss": 0.2409, + "loss": 0.2479, "step": 8660 }, { - "epoch": 0.9880196920975499, - "grad_norm": 0.519223690032959, + "epoch": 0.7128046151635221, + "grad_norm": 0.5163317918777466, "learning_rate": 0.0002, - "loss": 0.2353, + "loss": 0.2535, "step": 8680 }, { - "epoch": 0.990296235166899, - "grad_norm": 0.5701056122779846, + "epoch": 0.7144470221109035, + "grad_norm": 0.466194748878479, "learning_rate": 0.0002, - "loss": 0.2486, + "loss": 0.2533, "step": 8700 }, { - "epoch": 0.9925727782362482, - "grad_norm": 0.4519595503807068, + "epoch": 0.7160894290582849, + "grad_norm": 0.328848272562027, "learning_rate": 0.0002, - "loss": 0.2374, + "loss": 0.254, "step": 8720 }, { - "epoch": 0.9948493213055974, - "grad_norm": 0.4883946180343628, + "epoch": 0.7177318360056663, + "grad_norm": 0.5417701005935669, "learning_rate": 0.0002, - "loss": 0.2441, + "loss": 0.2544, "step": 8740 }, { - "epoch": 0.9971258643749467, - "grad_norm": 0.6918900012969971, + "epoch": 0.7193742429530476, + "grad_norm": 0.5538254976272583, "learning_rate": 0.0002, - "loss": 0.2403, + "loss": 0.2453, "step": 8760 }, { - "epoch": 0.9994024074442959, - "grad_norm": 0.4810091555118561, + "epoch": 0.7210166499004291, + "grad_norm": 0.4739200174808502, "learning_rate": 0.0002, - "loss": 0.2334, + "loss": 0.258, "step": 8780 }, + { + "epoch": 0.7226590568478105, + "grad_norm": 0.40133044123649597, + "learning_rate": 0.0002, + "loss": 0.2684, + "step": 8800 + }, + { + "epoch": 0.7243014637951919, + "grad_norm": 0.4493289291858673, + "learning_rate": 0.0002, + "loss": 0.2565, + "step": 8820 + }, + { + "epoch": 0.7259438707425733, + "grad_norm": 0.4970559775829315, + "learning_rate": 0.0002, + "loss": 0.2506, + "step": 8840 + }, + { + "epoch": 0.7275862776899547, + "grad_norm": 0.5687580108642578, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 8860 + }, + { + "epoch": 0.729228684637336, + "grad_norm": 0.5328338742256165, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 8880 + }, + { + "epoch": 0.7308710915847174, + "grad_norm": 0.47104090452194214, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 8900 + }, + { + "epoch": 0.7325134985320988, + "grad_norm": 0.4887702167034149, + "learning_rate": 0.0002, + "loss": 0.2532, + "step": 8920 + }, + { + "epoch": 0.7341559054794802, + "grad_norm": 0.3589889705181122, + "learning_rate": 0.0002, + "loss": 0.2587, + "step": 8940 + }, + { + "epoch": 0.7357983124268616, + "grad_norm": 0.4665176570415497, + "learning_rate": 0.0002, + "loss": 0.2407, + "step": 8960 + }, + { + "epoch": 0.7374407193742429, + "grad_norm": 0.2580777108669281, + "learning_rate": 0.0002, + "loss": 0.2501, + "step": 8980 + }, + { + "epoch": 0.7390831263216243, + "grad_norm": 0.5562865734100342, + "learning_rate": 0.0002, + "loss": 0.2589, + "step": 9000 + }, + { + "epoch": 0.7407255332690057, + "grad_norm": 0.36843666434288025, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 9020 + }, + { + "epoch": 0.7423679402163871, + "grad_norm": 0.433339387178421, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 9040 + }, + { + "epoch": 0.7440103471637685, + "grad_norm": 0.5565098524093628, + "learning_rate": 0.0002, + "loss": 0.2528, + "step": 9060 + }, + { + "epoch": 0.7456527541111498, + "grad_norm": 0.39954161643981934, + "learning_rate": 0.0002, + "loss": 0.24, + "step": 9080 + }, + { + "epoch": 0.7472951610585313, + "grad_norm": 0.43612274527549744, + "learning_rate": 0.0002, + "loss": 0.2373, + "step": 9100 + }, + { + "epoch": 0.7489375680059127, + "grad_norm": 0.4511432945728302, + "learning_rate": 0.0002, + "loss": 0.2564, + "step": 9120 + }, + { + "epoch": 0.7505799749532941, + "grad_norm": 0.3895890414714813, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 9140 + }, + { + "epoch": 0.7522223819006755, + "grad_norm": 0.4349375069141388, + "learning_rate": 0.0002, + "loss": 0.2582, + "step": 9160 + }, + { + "epoch": 0.7538647888480569, + "grad_norm": 0.39693930745124817, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 9180 + }, + { + "epoch": 0.7555071957954382, + "grad_norm": 0.35806095600128174, + "learning_rate": 0.0002, + "loss": 0.235, + "step": 9200 + }, + { + "epoch": 0.7571496027428196, + "grad_norm": 0.5650025010108948, + "learning_rate": 0.0002, + "loss": 0.2541, + "step": 9220 + }, + { + "epoch": 0.758792009690201, + "grad_norm": 0.45522645115852356, + "learning_rate": 0.0002, + "loss": 0.2323, + "step": 9240 + }, + { + "epoch": 0.7604344166375824, + "grad_norm": 0.45849525928497314, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 9260 + }, + { + "epoch": 0.7620768235849638, + "grad_norm": 0.5666941404342651, + "learning_rate": 0.0002, + "loss": 0.2634, + "step": 9280 + }, + { + "epoch": 0.7637192305323451, + "grad_norm": 0.43697381019592285, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 9300 + }, + { + "epoch": 0.7653616374797265, + "grad_norm": 0.5133718848228455, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 9320 + }, + { + "epoch": 0.7670040444271079, + "grad_norm": 0.5440112352371216, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 9340 + }, + { + "epoch": 0.7686464513744893, + "grad_norm": 0.5012624263763428, + "learning_rate": 0.0002, + "loss": 0.243, + "step": 9360 + }, + { + "epoch": 0.7702888583218707, + "grad_norm": 0.4387590289115906, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 9380 + }, + { + "epoch": 0.771931265269252, + "grad_norm": 0.4327554702758789, + "learning_rate": 0.0002, + "loss": 0.2514, + "step": 9400 + }, + { + "epoch": 0.7735736722166334, + "grad_norm": 0.4909968078136444, + "learning_rate": 0.0002, + "loss": 0.2503, + "step": 9420 + }, + { + "epoch": 0.7752160791640149, + "grad_norm": 0.4279715120792389, + "learning_rate": 0.0002, + "loss": 0.2558, + "step": 9440 + }, + { + "epoch": 0.7768584861113963, + "grad_norm": 0.4973134994506836, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 9460 + }, + { + "epoch": 0.7785008930587777, + "grad_norm": 0.3873676359653473, + "learning_rate": 0.0002, + "loss": 0.2409, + "step": 9480 + }, + { + "epoch": 0.7801433000061591, + "grad_norm": 0.40915995836257935, + "learning_rate": 0.0002, + "loss": 0.2322, + "step": 9500 + }, + { + "epoch": 0.7817857069535404, + "grad_norm": 0.5738871693611145, + "learning_rate": 0.0002, + "loss": 0.2408, + "step": 9520 + }, + { + "epoch": 0.7834281139009218, + "grad_norm": 0.49270549416542053, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 9540 + }, + { + "epoch": 0.7850705208483032, + "grad_norm": 0.4603147804737091, + "learning_rate": 0.0002, + "loss": 0.2402, + "step": 9560 + }, + { + "epoch": 0.7867129277956846, + "grad_norm": 0.47675642371177673, + "learning_rate": 0.0002, + "loss": 0.2528, + "step": 9580 + }, + { + "epoch": 0.788355334743066, + "grad_norm": 0.41800156235694885, + "learning_rate": 0.0002, + "loss": 0.2571, + "step": 9600 + }, + { + "epoch": 0.7899977416904473, + "grad_norm": 0.42527106404304504, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 9620 + }, + { + "epoch": 0.7916401486378287, + "grad_norm": 0.5056847333908081, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 9640 + }, + { + "epoch": 0.7932825555852101, + "grad_norm": 0.2951577305793762, + "learning_rate": 0.0002, + "loss": 0.233, + "step": 9660 + }, + { + "epoch": 0.7949249625325915, + "grad_norm": 0.4254283010959625, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 9680 + }, + { + "epoch": 0.7965673694799729, + "grad_norm": 0.5127973556518555, + "learning_rate": 0.0002, + "loss": 0.2655, + "step": 9700 + }, + { + "epoch": 0.7982097764273542, + "grad_norm": 0.3507694900035858, + "learning_rate": 0.0002, + "loss": 0.227, + "step": 9720 + }, + { + "epoch": 0.7998521833747356, + "grad_norm": 0.4255737364292145, + "learning_rate": 0.0002, + "loss": 0.2591, + "step": 9740 + }, + { + "epoch": 0.8014945903221171, + "grad_norm": 0.44822582602500916, + "learning_rate": 0.0002, + "loss": 0.2287, + "step": 9760 + }, + { + "epoch": 0.8031369972694985, + "grad_norm": 0.4737776517868042, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 9780 + }, + { + "epoch": 0.8047794042168799, + "grad_norm": 0.4281519651412964, + "learning_rate": 0.0002, + "loss": 0.2559, + "step": 9800 + }, + { + "epoch": 0.8064218111642613, + "grad_norm": 0.3413679301738739, + "learning_rate": 0.0002, + "loss": 0.2479, + "step": 9820 + }, + { + "epoch": 0.8080642181116426, + "grad_norm": 0.4361155033111572, + "learning_rate": 0.0002, + "loss": 0.2539, + "step": 9840 + }, + { + "epoch": 0.809706625059024, + "grad_norm": 0.48523005843162537, + "learning_rate": 0.0002, + "loss": 0.2534, + "step": 9860 + }, + { + "epoch": 0.8113490320064054, + "grad_norm": 0.4045993685722351, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 9880 + }, + { + "epoch": 0.8129914389537868, + "grad_norm": 0.5103000998497009, + "learning_rate": 0.0002, + "loss": 0.2535, + "step": 9900 + }, + { + "epoch": 0.8146338459011682, + "grad_norm": 0.3670307397842407, + "learning_rate": 0.0002, + "loss": 0.2337, + "step": 9920 + }, + { + "epoch": 0.8162762528485495, + "grad_norm": 0.3149369955062866, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 9940 + }, + { + "epoch": 0.8179186597959309, + "grad_norm": 0.5316740274429321, + "learning_rate": 0.0002, + "loss": 0.2373, + "step": 9960 + }, + { + "epoch": 0.8195610667433123, + "grad_norm": 0.5300164222717285, + "learning_rate": 0.0002, + "loss": 0.2399, + "step": 9980 + }, + { + "epoch": 0.8212034736906937, + "grad_norm": 0.48414990305900574, + "learning_rate": 0.0002, + "loss": 0.2331, + "step": 10000 + }, + { + "epoch": 0.8228458806380751, + "grad_norm": 0.41733840107917786, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 10020 + }, + { + "epoch": 0.8244882875854564, + "grad_norm": 0.5048840045928955, + "learning_rate": 0.0002, + "loss": 0.2421, + "step": 10040 + }, + { + "epoch": 0.8261306945328378, + "grad_norm": 0.4444895386695862, + "learning_rate": 0.0002, + "loss": 0.2537, + "step": 10060 + }, + { + "epoch": 0.8277731014802192, + "grad_norm": 0.45051780343055725, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 10080 + }, + { + "epoch": 0.8294155084276007, + "grad_norm": 0.3937041163444519, + "learning_rate": 0.0002, + "loss": 0.243, + "step": 10100 + }, + { + "epoch": 0.8310579153749821, + "grad_norm": 0.45621591806411743, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 10120 + }, + { + "epoch": 0.8327003223223635, + "grad_norm": 0.5431267619132996, + "learning_rate": 0.0002, + "loss": 0.2425, + "step": 10140 + }, + { + "epoch": 0.8343427292697448, + "grad_norm": 0.5039596557617188, + "learning_rate": 0.0002, + "loss": 0.2379, + "step": 10160 + }, + { + "epoch": 0.8359851362171262, + "grad_norm": 0.3915367126464844, + "learning_rate": 0.0002, + "loss": 0.241, + "step": 10180 + }, + { + "epoch": 0.8376275431645076, + "grad_norm": 0.46073317527770996, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 10200 + }, + { + "epoch": 0.839269950111889, + "grad_norm": 0.47057440876960754, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 10220 + }, + { + "epoch": 0.8409123570592704, + "grad_norm": 0.6143821477890015, + "learning_rate": 0.0002, + "loss": 0.2394, + "step": 10240 + }, + { + "epoch": 0.8425547640066517, + "grad_norm": 0.41434940695762634, + "learning_rate": 0.0002, + "loss": 0.2332, + "step": 10260 + }, + { + "epoch": 0.8441971709540331, + "grad_norm": 0.467459499835968, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 10280 + }, + { + "epoch": 0.8458395779014145, + "grad_norm": 0.49404439330101013, + "learning_rate": 0.0002, + "loss": 0.2378, + "step": 10300 + }, + { + "epoch": 0.8474819848487959, + "grad_norm": 0.4313650131225586, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 10320 + }, + { + "epoch": 0.8491243917961773, + "grad_norm": 0.34277698397636414, + "learning_rate": 0.0002, + "loss": 0.2396, + "step": 10340 + }, + { + "epoch": 0.8507667987435586, + "grad_norm": 0.3649916350841522, + "learning_rate": 0.0002, + "loss": 0.2348, + "step": 10360 + }, + { + "epoch": 0.85240920569094, + "grad_norm": 0.4841578006744385, + "learning_rate": 0.0002, + "loss": 0.2488, + "step": 10380 + }, + { + "epoch": 0.8540516126383214, + "grad_norm": 0.5488325953483582, + "learning_rate": 0.0002, + "loss": 0.2399, + "step": 10400 + }, + { + "epoch": 0.8556940195857029, + "grad_norm": 0.41103577613830566, + "learning_rate": 0.0002, + "loss": 0.2371, + "step": 10420 + }, + { + "epoch": 0.8573364265330843, + "grad_norm": 0.42253378033638, + "learning_rate": 0.0002, + "loss": 0.2478, + "step": 10440 + }, + { + "epoch": 0.8589788334804657, + "grad_norm": 0.43092676997184753, + "learning_rate": 0.0002, + "loss": 0.2316, + "step": 10460 + }, + { + "epoch": 0.860621240427847, + "grad_norm": 0.5474075078964233, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 10480 + }, + { + "epoch": 0.8622636473752284, + "grad_norm": 0.474618524312973, + "learning_rate": 0.0002, + "loss": 0.2378, + "step": 10500 + }, + { + "epoch": 0.8639060543226098, + "grad_norm": 0.44008612632751465, + "learning_rate": 0.0002, + "loss": 0.236, + "step": 10520 + }, + { + "epoch": 0.8655484612699912, + "grad_norm": 0.4194040894508362, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 10540 + }, + { + "epoch": 0.8671908682173726, + "grad_norm": 0.3890872597694397, + "learning_rate": 0.0002, + "loss": 0.2308, + "step": 10560 + }, + { + "epoch": 0.868833275164754, + "grad_norm": 0.41979917883872986, + "learning_rate": 0.0002, + "loss": 0.2417, + "step": 10580 + }, + { + "epoch": 0.8704756821121353, + "grad_norm": 0.3800947666168213, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 10600 + }, + { + "epoch": 0.8721180890595167, + "grad_norm": 0.38609811663627625, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 10620 + }, + { + "epoch": 0.8737604960068981, + "grad_norm": 0.514067530632019, + "learning_rate": 0.0002, + "loss": 0.2382, + "step": 10640 + }, + { + "epoch": 0.8754029029542795, + "grad_norm": 0.47742265462875366, + "learning_rate": 0.0002, + "loss": 0.2298, + "step": 10660 + }, + { + "epoch": 0.8770453099016609, + "grad_norm": 0.45849281549453735, + "learning_rate": 0.0002, + "loss": 0.2332, + "step": 10680 + }, + { + "epoch": 0.8786877168490422, + "grad_norm": 0.39788320660591125, + "learning_rate": 0.0002, + "loss": 0.2363, + "step": 10700 + }, + { + "epoch": 0.8803301237964236, + "grad_norm": 0.5124650597572327, + "learning_rate": 0.0002, + "loss": 0.2292, + "step": 10720 + }, + { + "epoch": 0.881972530743805, + "grad_norm": 0.48688754439353943, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 10740 + }, + { + "epoch": 0.8836149376911865, + "grad_norm": 0.46146026253700256, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 10760 + }, + { + "epoch": 0.8852573446385679, + "grad_norm": 0.38401076197624207, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 10780 + }, + { + "epoch": 0.8868997515859492, + "grad_norm": 0.4642081558704376, + "learning_rate": 0.0002, + "loss": 0.2338, + "step": 10800 + }, + { + "epoch": 0.8885421585333306, + "grad_norm": 0.378845751285553, + "learning_rate": 0.0002, + "loss": 0.2203, + "step": 10820 + }, + { + "epoch": 0.890184565480712, + "grad_norm": 0.3785631060600281, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 10840 + }, + { + "epoch": 0.8918269724280934, + "grad_norm": 0.4151659309864044, + "learning_rate": 0.0002, + "loss": 0.2361, + "step": 10860 + }, + { + "epoch": 0.8934693793754748, + "grad_norm": 0.3314524292945862, + "learning_rate": 0.0002, + "loss": 0.241, + "step": 10880 + }, + { + "epoch": 0.8951117863228562, + "grad_norm": 0.4619898200035095, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 10900 + }, + { + "epoch": 0.8967541932702375, + "grad_norm": 0.5724550485610962, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 10920 + }, + { + "epoch": 0.8983966002176189, + "grad_norm": 0.3766199052333832, + "learning_rate": 0.0002, + "loss": 0.2319, + "step": 10940 + }, + { + "epoch": 0.9000390071650003, + "grad_norm": 0.4241611659526825, + "learning_rate": 0.0002, + "loss": 0.2316, + "step": 10960 + }, + { + "epoch": 0.9016814141123817, + "grad_norm": 0.35726866126060486, + "learning_rate": 0.0002, + "loss": 0.2343, + "step": 10980 + }, + { + "epoch": 0.903323821059763, + "grad_norm": 0.5252423882484436, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 11000 + }, + { + "epoch": 0.9049662280071444, + "grad_norm": 0.47167885303497314, + "learning_rate": 0.0002, + "loss": 0.2512, + "step": 11020 + }, + { + "epoch": 0.9066086349545258, + "grad_norm": 0.4106541872024536, + "learning_rate": 0.0002, + "loss": 0.2397, + "step": 11040 + }, + { + "epoch": 0.9082510419019072, + "grad_norm": 0.4804975390434265, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 11060 + }, + { + "epoch": 0.9098934488492886, + "grad_norm": 0.4177796542644501, + "learning_rate": 0.0002, + "loss": 0.2302, + "step": 11080 + }, + { + "epoch": 0.9115358557966701, + "grad_norm": 0.34781017899513245, + "learning_rate": 0.0002, + "loss": 0.2285, + "step": 11100 + }, + { + "epoch": 0.9131782627440514, + "grad_norm": 0.34392043948173523, + "learning_rate": 0.0002, + "loss": 0.232, + "step": 11120 + }, + { + "epoch": 0.9148206696914328, + "grad_norm": 0.46544018387794495, + "learning_rate": 0.0002, + "loss": 0.2332, + "step": 11140 + }, + { + "epoch": 0.9164630766388142, + "grad_norm": 0.47958704829216003, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 11160 + }, + { + "epoch": 0.9181054835861956, + "grad_norm": 0.4493333697319031, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 11180 + }, + { + "epoch": 0.919747890533577, + "grad_norm": 0.47599494457244873, + "learning_rate": 0.0002, + "loss": 0.2416, + "step": 11200 + }, + { + "epoch": 0.9213902974809584, + "grad_norm": 0.39547592401504517, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 11220 + }, + { + "epoch": 0.9230327044283397, + "grad_norm": 0.42187511920928955, + "learning_rate": 0.0002, + "loss": 0.2425, + "step": 11240 + }, + { + "epoch": 0.9246751113757211, + "grad_norm": 0.3870528042316437, + "learning_rate": 0.0002, + "loss": 0.2366, + "step": 11260 + }, + { + "epoch": 0.9263175183231025, + "grad_norm": 0.40943118929862976, + "learning_rate": 0.0002, + "loss": 0.2088, + "step": 11280 + }, + { + "epoch": 0.9279599252704839, + "grad_norm": 0.3936561346054077, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 11300 + }, + { + "epoch": 0.9296023322178653, + "grad_norm": 0.4154857397079468, + "learning_rate": 0.0002, + "loss": 0.2413, + "step": 11320 + }, + { + "epoch": 0.9312447391652466, + "grad_norm": 0.5544102191925049, + "learning_rate": 0.0002, + "loss": 0.2565, + "step": 11340 + }, + { + "epoch": 0.932887146112628, + "grad_norm": 0.5494611263275146, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 11360 + }, + { + "epoch": 0.9345295530600094, + "grad_norm": 0.41848114132881165, + "learning_rate": 0.0002, + "loss": 0.2333, + "step": 11380 + }, + { + "epoch": 0.9361719600073908, + "grad_norm": 0.41343703866004944, + "learning_rate": 0.0002, + "loss": 0.2342, + "step": 11400 + }, + { + "epoch": 0.9378143669547723, + "grad_norm": 0.6060330867767334, + "learning_rate": 0.0002, + "loss": 0.2507, + "step": 11420 + }, + { + "epoch": 0.9394567739021537, + "grad_norm": 0.42079275846481323, + "learning_rate": 0.0002, + "loss": 0.2322, + "step": 11440 + }, + { + "epoch": 0.941099180849535, + "grad_norm": 0.43053537607192993, + "learning_rate": 0.0002, + "loss": 0.2257, + "step": 11460 + }, + { + "epoch": 0.9427415877969164, + "grad_norm": 0.41895121335983276, + "learning_rate": 0.0002, + "loss": 0.2501, + "step": 11480 + }, + { + "epoch": 0.9443839947442978, + "grad_norm": 0.467018723487854, + "learning_rate": 0.0002, + "loss": 0.2282, + "step": 11500 + }, + { + "epoch": 0.9460264016916792, + "grad_norm": 0.5707799196243286, + "learning_rate": 0.0002, + "loss": 0.2319, + "step": 11520 + }, + { + "epoch": 0.9476688086390606, + "grad_norm": 0.4575120806694031, + "learning_rate": 0.0002, + "loss": 0.2291, + "step": 11540 + }, + { + "epoch": 0.9493112155864419, + "grad_norm": 0.38349372148513794, + "learning_rate": 0.0002, + "loss": 0.2263, + "step": 11560 + }, + { + "epoch": 0.9509536225338233, + "grad_norm": 0.4487491846084595, + "learning_rate": 0.0002, + "loss": 0.2505, + "step": 11580 + }, + { + "epoch": 0.9525960294812047, + "grad_norm": 0.39065688848495483, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 11600 + }, + { + "epoch": 0.9542384364285861, + "grad_norm": 0.4473966658115387, + "learning_rate": 0.0002, + "loss": 0.2409, + "step": 11620 + }, + { + "epoch": 0.9558808433759675, + "grad_norm": 0.39066895842552185, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 11640 + }, + { + "epoch": 0.9575232503233488, + "grad_norm": 0.470277339220047, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 11660 + }, + { + "epoch": 0.9591656572707302, + "grad_norm": 0.405834436416626, + "learning_rate": 0.0002, + "loss": 0.2408, + "step": 11680 + }, + { + "epoch": 0.9608080642181116, + "grad_norm": 0.5717544555664062, + "learning_rate": 0.0002, + "loss": 0.2352, + "step": 11700 + }, + { + "epoch": 0.962450471165493, + "grad_norm": 0.4837093651294708, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 11720 + }, + { + "epoch": 0.9640928781128744, + "grad_norm": 0.4689130187034607, + "learning_rate": 0.0002, + "loss": 0.2324, + "step": 11740 + }, + { + "epoch": 0.9657352850602559, + "grad_norm": 0.511249840259552, + "learning_rate": 0.0002, + "loss": 0.2394, + "step": 11760 + }, + { + "epoch": 0.9673776920076372, + "grad_norm": 0.43555593490600586, + "learning_rate": 0.0002, + "loss": 0.2377, + "step": 11780 + }, + { + "epoch": 0.9690200989550186, + "grad_norm": 0.41933077573776245, + "learning_rate": 0.0002, + "loss": 0.2355, + "step": 11800 + }, + { + "epoch": 0.9706625059024, + "grad_norm": 0.41573819518089294, + "learning_rate": 0.0002, + "loss": 0.2345, + "step": 11820 + }, + { + "epoch": 0.9723049128497814, + "grad_norm": 0.3951037526130676, + "learning_rate": 0.0002, + "loss": 0.2399, + "step": 11840 + }, + { + "epoch": 0.9739473197971628, + "grad_norm": 0.477756142616272, + "learning_rate": 0.0002, + "loss": 0.2425, + "step": 11860 + }, + { + "epoch": 0.9755897267445441, + "grad_norm": 0.5147901773452759, + "learning_rate": 0.0002, + "loss": 0.2354, + "step": 11880 + }, + { + "epoch": 0.9772321336919255, + "grad_norm": 0.40053385496139526, + "learning_rate": 0.0002, + "loss": 0.2325, + "step": 11900 + }, + { + "epoch": 0.9788745406393069, + "grad_norm": 0.4459463953971863, + "learning_rate": 0.0002, + "loss": 0.2492, + "step": 11920 + }, + { + "epoch": 0.9805169475866883, + "grad_norm": 0.42749595642089844, + "learning_rate": 0.0002, + "loss": 0.2308, + "step": 11940 + }, + { + "epoch": 0.9821593545340697, + "grad_norm": 0.4053783714771271, + "learning_rate": 0.0002, + "loss": 0.2263, + "step": 11960 + }, + { + "epoch": 0.983801761481451, + "grad_norm": 0.43342533707618713, + "learning_rate": 0.0002, + "loss": 0.2348, + "step": 11980 + }, + { + "epoch": 0.9854441684288324, + "grad_norm": 0.43272313475608826, + "learning_rate": 0.0002, + "loss": 0.2234, + "step": 12000 + }, + { + "epoch": 0.9870865753762138, + "grad_norm": 0.3550325036048889, + "learning_rate": 0.0002, + "loss": 0.2186, + "step": 12020 + }, + { + "epoch": 0.9887289823235952, + "grad_norm": 0.35271936655044556, + "learning_rate": 0.0002, + "loss": 0.2326, + "step": 12040 + }, + { + "epoch": 0.9903713892709766, + "grad_norm": 0.37404924631118774, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 12060 + }, + { + "epoch": 0.9920137962183581, + "grad_norm": 0.46686896681785583, + "learning_rate": 0.0002, + "loss": 0.2213, + "step": 12080 + }, + { + "epoch": 0.9936562031657394, + "grad_norm": 0.37012913823127747, + "learning_rate": 0.0002, + "loss": 0.2415, + "step": 12100 + }, + { + "epoch": 0.9952986101131208, + "grad_norm": 0.4403967559337616, + "learning_rate": 0.0002, + "loss": 0.2261, + "step": 12120 + }, + { + "epoch": 0.9969410170605022, + "grad_norm": 0.36877259612083435, + "learning_rate": 0.0002, + "loss": 0.2295, + "step": 12140 + }, + { + "epoch": 0.9985834240078836, + "grad_norm": 0.34526777267456055, + "learning_rate": 0.0002, + "loss": 0.2236, + "step": 12160 + }, { "epoch": 1.0, - "eval_loss": 0.30941203236579895, - "eval_runtime": 408.7196, - "eval_samples_per_second": 7.083, - "eval_steps_per_second": 0.886, - "step": 8786 + "eval_loss": 0.30336490273475647, + "eval_runtime": 533.8677, + "eval_samples_per_second": 7.092, + "eval_steps_per_second": 0.888, + "step": 12178 } ], "logging_steps": 20, - "max_steps": 13000, + "max_steps": 14000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 77, @@ -3107,7 +4290,7 @@ "attributes": {} } }, - "total_flos": 2.923169198364426e+18, + "total_flos": 4.0518674601423667e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null