{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1061208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014134835018205668, "grad_norm": 3.2685067653656006, "learning_rate": 4.9976441941636326e-05, "loss": 1.5678, "step": 500 }, { "epoch": 0.0028269670036411335, "grad_norm": 2.980227470397949, "learning_rate": 4.995288388327265e-05, "loss": 1.403, "step": 1000 }, { "epoch": 0.0042404505054617, "grad_norm": 3.102825403213501, "learning_rate": 4.9929325824908973e-05, "loss": 1.3368, "step": 1500 }, { "epoch": 0.005653934007282267, "grad_norm": 3.1001083850860596, "learning_rate": 4.99057677665453e-05, "loss": 1.2861, "step": 2000 }, { "epoch": 0.007067417509102834, "grad_norm": 2.820470094680786, "learning_rate": 4.988220970818162e-05, "loss": 1.2525, "step": 2500 }, { "epoch": 0.0084809010109234, "grad_norm": 2.7294390201568604, "learning_rate": 4.9858651649817944e-05, "loss": 1.2147, "step": 3000 }, { "epoch": 0.009894384512743968, "grad_norm": 2.6046652793884277, "learning_rate": 4.983509359145427e-05, "loss": 1.1935, "step": 3500 }, { "epoch": 0.011307868014564534, "grad_norm": 2.691831588745117, "learning_rate": 4.981153553309059e-05, "loss": 1.1695, "step": 4000 }, { "epoch": 0.0127213515163851, "grad_norm": 2.6256425380706787, "learning_rate": 4.9787977474726915e-05, "loss": 1.1492, "step": 4500 }, { "epoch": 0.014134835018205668, "grad_norm": 2.6606156826019287, "learning_rate": 4.976441941636324e-05, "loss": 1.1298, "step": 5000 }, { "epoch": 0.015548318520026234, "grad_norm": 2.5861198902130127, "learning_rate": 4.974086135799956e-05, "loss": 1.1116, "step": 5500 }, { "epoch": 0.0169618020218468, "grad_norm": 2.479706048965454, "learning_rate": 4.9717303299635886e-05, "loss": 1.1076, "step": 6000 }, { "epoch": 0.018375285523667368, "grad_norm": 2.5231101512908936, "learning_rate": 4.969374524127221e-05, "loss": 1.0928, "step": 6500 }, { "epoch": 0.019788769025487936, "grad_norm": 2.448791265487671, "learning_rate": 4.9670187182908534e-05, "loss": 1.0745, "step": 7000 }, { "epoch": 0.0212022525273085, "grad_norm": 2.5575363636016846, "learning_rate": 4.9646629124544864e-05, "loss": 1.0666, "step": 7500 }, { "epoch": 0.022615736029129068, "grad_norm": 2.460310220718384, "learning_rate": 4.962307106618119e-05, "loss": 1.0574, "step": 8000 }, { "epoch": 0.024029219530949636, "grad_norm": 2.2827608585357666, "learning_rate": 4.959951300781751e-05, "loss": 1.0548, "step": 8500 }, { "epoch": 0.0254427030327702, "grad_norm": 2.3516390323638916, "learning_rate": 4.9575954949453835e-05, "loss": 1.0369, "step": 9000 }, { "epoch": 0.026856186534590768, "grad_norm": 2.470848560333252, "learning_rate": 4.955239689109016e-05, "loss": 1.028, "step": 9500 }, { "epoch": 0.028269670036411336, "grad_norm": 2.2024457454681396, "learning_rate": 4.952883883272648e-05, "loss": 1.0184, "step": 10000 }, { "epoch": 0.0296831535382319, "grad_norm": 2.343493938446045, "learning_rate": 4.9505280774362806e-05, "loss": 1.0101, "step": 10500 }, { "epoch": 0.031096637040052468, "grad_norm": 2.277790069580078, "learning_rate": 4.948172271599913e-05, "loss": 1.0003, "step": 11000 }, { "epoch": 0.032510120541873036, "grad_norm": 2.7002475261688232, "learning_rate": 4.9458164657635454e-05, "loss": 0.9988, "step": 11500 }, { "epoch": 0.0339236040436936, "grad_norm": 2.4524242877960205, "learning_rate": 4.943460659927178e-05, "loss": 0.9928, "step": 12000 }, { "epoch": 0.03533708754551417, "grad_norm": 2.5849318504333496, "learning_rate": 4.94110485409081e-05, "loss": 0.9797, "step": 12500 }, { "epoch": 0.036750571047334736, "grad_norm": 2.3430843353271484, "learning_rate": 4.9387490482544425e-05, "loss": 0.9719, "step": 13000 }, { "epoch": 0.0381640545491553, "grad_norm": 2.382037878036499, "learning_rate": 4.936393242418075e-05, "loss": 0.9733, "step": 13500 }, { "epoch": 0.03957753805097587, "grad_norm": 2.3416788578033447, "learning_rate": 4.934037436581707e-05, "loss": 0.9746, "step": 14000 }, { "epoch": 0.040991021552796436, "grad_norm": 2.247741222381592, "learning_rate": 4.9316816307453396e-05, "loss": 0.9654, "step": 14500 }, { "epoch": 0.042404505054617, "grad_norm": 2.3570547103881836, "learning_rate": 4.929325824908972e-05, "loss": 0.9567, "step": 15000 }, { "epoch": 0.04381798855643757, "grad_norm": 2.4221949577331543, "learning_rate": 4.926970019072604e-05, "loss": 0.9545, "step": 15500 }, { "epoch": 0.045231472058258136, "grad_norm": 2.4117672443389893, "learning_rate": 4.924614213236237e-05, "loss": 0.9495, "step": 16000 }, { "epoch": 0.0466449555600787, "grad_norm": 2.1933741569519043, "learning_rate": 4.922258407399869e-05, "loss": 0.9417, "step": 16500 }, { "epoch": 0.04805843906189927, "grad_norm": 2.1862645149230957, "learning_rate": 4.9199026015635014e-05, "loss": 0.9388, "step": 17000 }, { "epoch": 0.049471922563719836, "grad_norm": 2.223505973815918, "learning_rate": 4.917546795727134e-05, "loss": 0.9334, "step": 17500 }, { "epoch": 0.0508854060655404, "grad_norm": 2.3838627338409424, "learning_rate": 4.915190989890766e-05, "loss": 0.9329, "step": 18000 }, { "epoch": 0.05229888956736097, "grad_norm": 2.3882734775543213, "learning_rate": 4.9128351840543985e-05, "loss": 0.9221, "step": 18500 }, { "epoch": 0.053712373069181536, "grad_norm": 2.0582985877990723, "learning_rate": 4.910479378218031e-05, "loss": 0.9235, "step": 19000 }, { "epoch": 0.0551258565710021, "grad_norm": 2.3134665489196777, "learning_rate": 4.908123572381663e-05, "loss": 0.9174, "step": 19500 }, { "epoch": 0.05653934007282267, "grad_norm": 2.366582155227661, "learning_rate": 4.9057677665452956e-05, "loss": 0.9124, "step": 20000 }, { "epoch": 0.057952823574643236, "grad_norm": 2.3792452812194824, "learning_rate": 4.903411960708928e-05, "loss": 0.9116, "step": 20500 }, { "epoch": 0.0593663070764638, "grad_norm": 2.7291083335876465, "learning_rate": 4.9010561548725603e-05, "loss": 0.9046, "step": 21000 }, { "epoch": 0.06077979057828437, "grad_norm": 2.2825348377227783, "learning_rate": 4.898700349036193e-05, "loss": 0.9077, "step": 21500 }, { "epoch": 0.062193274080104936, "grad_norm": 2.181398391723633, "learning_rate": 4.896344543199826e-05, "loss": 0.9005, "step": 22000 }, { "epoch": 0.06360675758192551, "grad_norm": 2.112208604812622, "learning_rate": 4.893988737363458e-05, "loss": 0.895, "step": 22500 }, { "epoch": 0.06502024108374607, "grad_norm": 2.347834587097168, "learning_rate": 4.8916329315270905e-05, "loss": 0.8923, "step": 23000 }, { "epoch": 0.06643372458556664, "grad_norm": 2.2709128856658936, "learning_rate": 4.889277125690723e-05, "loss": 0.8885, "step": 23500 }, { "epoch": 0.0678472080873872, "grad_norm": 2.1835873126983643, "learning_rate": 4.886921319854355e-05, "loss": 0.8838, "step": 24000 }, { "epoch": 0.06926069158920777, "grad_norm": 2.3090507984161377, "learning_rate": 4.8845655140179876e-05, "loss": 0.8802, "step": 24500 }, { "epoch": 0.07067417509102834, "grad_norm": 2.1905269622802734, "learning_rate": 4.88220970818162e-05, "loss": 0.8777, "step": 25000 }, { "epoch": 0.07208765859284891, "grad_norm": 2.1602389812469482, "learning_rate": 4.879853902345252e-05, "loss": 0.8783, "step": 25500 }, { "epoch": 0.07350114209466947, "grad_norm": 2.2013607025146484, "learning_rate": 4.877498096508885e-05, "loss": 0.8709, "step": 26000 }, { "epoch": 0.07491462559649004, "grad_norm": 2.2046091556549072, "learning_rate": 4.875142290672517e-05, "loss": 0.8815, "step": 26500 }, { "epoch": 0.0763281090983106, "grad_norm": 2.103070020675659, "learning_rate": 4.8727864848361494e-05, "loss": 0.869, "step": 27000 }, { "epoch": 0.07774159260013117, "grad_norm": 2.249695301055908, "learning_rate": 4.870430678999782e-05, "loss": 0.8672, "step": 27500 }, { "epoch": 0.07915507610195174, "grad_norm": 2.0575194358825684, "learning_rate": 4.868074873163414e-05, "loss": 0.8673, "step": 28000 }, { "epoch": 0.08056855960377231, "grad_norm": 2.260659694671631, "learning_rate": 4.8657190673270465e-05, "loss": 0.8642, "step": 28500 }, { "epoch": 0.08198204310559287, "grad_norm": 2.003070116043091, "learning_rate": 4.863363261490679e-05, "loss": 0.8557, "step": 29000 }, { "epoch": 0.08339552660741344, "grad_norm": 2.3090686798095703, "learning_rate": 4.861007455654311e-05, "loss": 0.8543, "step": 29500 }, { "epoch": 0.084809010109234, "grad_norm": 1.9025274515151978, "learning_rate": 4.8586516498179436e-05, "loss": 0.8566, "step": 30000 }, { "epoch": 0.08622249361105457, "grad_norm": 2.0132434368133545, "learning_rate": 4.856295843981576e-05, "loss": 0.8481, "step": 30500 }, { "epoch": 0.08763597711287514, "grad_norm": 2.1482937335968018, "learning_rate": 4.8539400381452084e-05, "loss": 0.8473, "step": 31000 }, { "epoch": 0.08904946061469571, "grad_norm": 2.230346202850342, "learning_rate": 4.851584232308841e-05, "loss": 0.8473, "step": 31500 }, { "epoch": 0.09046294411651627, "grad_norm": 2.2641255855560303, "learning_rate": 4.849228426472473e-05, "loss": 0.8484, "step": 32000 }, { "epoch": 0.09187642761833684, "grad_norm": 2.2655484676361084, "learning_rate": 4.8468726206361055e-05, "loss": 0.845, "step": 32500 }, { "epoch": 0.0932899111201574, "grad_norm": 2.1780354976654053, "learning_rate": 4.844516814799738e-05, "loss": 0.8419, "step": 33000 }, { "epoch": 0.09470339462197797, "grad_norm": 2.440086603164673, "learning_rate": 4.84216100896337e-05, "loss": 0.8382, "step": 33500 }, { "epoch": 0.09611687812379854, "grad_norm": 2.0652029514312744, "learning_rate": 4.8398052031270026e-05, "loss": 0.8344, "step": 34000 }, { "epoch": 0.09753036162561911, "grad_norm": 2.087705373764038, "learning_rate": 4.837449397290635e-05, "loss": 0.8404, "step": 34500 }, { "epoch": 0.09894384512743967, "grad_norm": 2.203565835952759, "learning_rate": 4.835093591454267e-05, "loss": 0.8369, "step": 35000 }, { "epoch": 0.10035732862926024, "grad_norm": 2.5251693725585938, "learning_rate": 4.8327377856179e-05, "loss": 0.8343, "step": 35500 }, { "epoch": 0.1017708121310808, "grad_norm": 2.1679134368896484, "learning_rate": 4.830381979781532e-05, "loss": 0.8251, "step": 36000 }, { "epoch": 0.10318429563290138, "grad_norm": 2.189016103744507, "learning_rate": 4.828026173945165e-05, "loss": 0.8316, "step": 36500 }, { "epoch": 0.10459777913472194, "grad_norm": 2.2385709285736084, "learning_rate": 4.8256703681087975e-05, "loss": 0.8233, "step": 37000 }, { "epoch": 0.10601126263654251, "grad_norm": 2.1296896934509277, "learning_rate": 4.82331456227243e-05, "loss": 0.8252, "step": 37500 }, { "epoch": 0.10742474613836307, "grad_norm": 2.3188064098358154, "learning_rate": 4.820958756436062e-05, "loss": 0.8206, "step": 38000 }, { "epoch": 0.10883822964018364, "grad_norm": 2.0144803524017334, "learning_rate": 4.8186029505996946e-05, "loss": 0.8162, "step": 38500 }, { "epoch": 0.1102517131420042, "grad_norm": 2.4890646934509277, "learning_rate": 4.816247144763327e-05, "loss": 0.8212, "step": 39000 }, { "epoch": 0.11166519664382478, "grad_norm": 2.0380167961120605, "learning_rate": 4.813891338926959e-05, "loss": 0.8189, "step": 39500 }, { "epoch": 0.11307868014564534, "grad_norm": 1.9491159915924072, "learning_rate": 4.811535533090592e-05, "loss": 0.8173, "step": 40000 }, { "epoch": 0.11449216364746591, "grad_norm": 2.1123366355895996, "learning_rate": 4.809179727254224e-05, "loss": 0.8197, "step": 40500 }, { "epoch": 0.11590564714928647, "grad_norm": 1.920899510383606, "learning_rate": 4.8068239214178564e-05, "loss": 0.812, "step": 41000 }, { "epoch": 0.11731913065110704, "grad_norm": 2.0682876110076904, "learning_rate": 4.804468115581489e-05, "loss": 0.8085, "step": 41500 }, { "epoch": 0.1187326141529276, "grad_norm": 2.029426336288452, "learning_rate": 4.802112309745121e-05, "loss": 0.8109, "step": 42000 }, { "epoch": 0.12014609765474818, "grad_norm": 2.2340686321258545, "learning_rate": 4.7997565039087535e-05, "loss": 0.8076, "step": 42500 }, { "epoch": 0.12155958115656874, "grad_norm": 2.141270160675049, "learning_rate": 4.797400698072386e-05, "loss": 0.8123, "step": 43000 }, { "epoch": 0.12297306465838931, "grad_norm": 2.059101104736328, "learning_rate": 4.795044892236018e-05, "loss": 0.8089, "step": 43500 }, { "epoch": 0.12438654816020987, "grad_norm": 2.00829815864563, "learning_rate": 4.7926890863996506e-05, "loss": 0.8054, "step": 44000 }, { "epoch": 0.12580003166203044, "grad_norm": 2.1688382625579834, "learning_rate": 4.790333280563283e-05, "loss": 0.7977, "step": 44500 }, { "epoch": 0.12721351516385102, "grad_norm": 2.327826499938965, "learning_rate": 4.7879774747269153e-05, "loss": 0.8013, "step": 45000 }, { "epoch": 0.12862699866567157, "grad_norm": 1.951831579208374, "learning_rate": 4.785621668890548e-05, "loss": 0.796, "step": 45500 }, { "epoch": 0.13004048216749214, "grad_norm": 2.0565438270568848, "learning_rate": 4.78326586305418e-05, "loss": 0.8012, "step": 46000 }, { "epoch": 0.1314539656693127, "grad_norm": 2.1289477348327637, "learning_rate": 4.7809100572178124e-05, "loss": 0.7951, "step": 46500 }, { "epoch": 0.13286744917113327, "grad_norm": 1.862220287322998, "learning_rate": 4.778554251381445e-05, "loss": 0.7957, "step": 47000 }, { "epoch": 0.13428093267295385, "grad_norm": 2.18941068649292, "learning_rate": 4.776198445545077e-05, "loss": 0.792, "step": 47500 }, { "epoch": 0.1356944161747744, "grad_norm": 2.0579240322113037, "learning_rate": 4.7738426397087095e-05, "loss": 0.7898, "step": 48000 }, { "epoch": 0.13710789967659498, "grad_norm": 2.274019718170166, "learning_rate": 4.771486833872342e-05, "loss": 0.7889, "step": 48500 }, { "epoch": 0.13852138317841553, "grad_norm": 2.0458860397338867, "learning_rate": 4.769131028035974e-05, "loss": 0.7893, "step": 49000 }, { "epoch": 0.1399348666802361, "grad_norm": 2.3238253593444824, "learning_rate": 4.7667752221996066e-05, "loss": 0.7842, "step": 49500 }, { "epoch": 0.1413483501820567, "grad_norm": 2.238711357116699, "learning_rate": 4.764419416363239e-05, "loss": 0.7881, "step": 50000 }, { "epoch": 0.14276183368387724, "grad_norm": 2.0283846855163574, "learning_rate": 4.7620636105268714e-05, "loss": 0.7849, "step": 50500 }, { "epoch": 0.14417531718569782, "grad_norm": 2.1474900245666504, "learning_rate": 4.7597078046905044e-05, "loss": 0.7819, "step": 51000 }, { "epoch": 0.14558880068751837, "grad_norm": 1.918904423713684, "learning_rate": 4.757351998854137e-05, "loss": 0.7775, "step": 51500 }, { "epoch": 0.14700228418933894, "grad_norm": 2.094169855117798, "learning_rate": 4.754996193017769e-05, "loss": 0.7765, "step": 52000 }, { "epoch": 0.14841576769115952, "grad_norm": 2.204805374145508, "learning_rate": 4.7526403871814015e-05, "loss": 0.7795, "step": 52500 }, { "epoch": 0.14982925119298007, "grad_norm": 1.9685603380203247, "learning_rate": 4.750284581345034e-05, "loss": 0.7821, "step": 53000 }, { "epoch": 0.15124273469480065, "grad_norm": 1.9384974241256714, "learning_rate": 4.747928775508666e-05, "loss": 0.7776, "step": 53500 }, { "epoch": 0.1526562181966212, "grad_norm": 2.17044734954834, "learning_rate": 4.7455729696722986e-05, "loss": 0.7718, "step": 54000 }, { "epoch": 0.15406970169844178, "grad_norm": 2.0725958347320557, "learning_rate": 4.743217163835931e-05, "loss": 0.7747, "step": 54500 }, { "epoch": 0.15548318520026233, "grad_norm": 2.083662748336792, "learning_rate": 4.7408613579995634e-05, "loss": 0.7729, "step": 55000 }, { "epoch": 0.1568966687020829, "grad_norm": 2.002969980239868, "learning_rate": 4.738505552163196e-05, "loss": 0.7751, "step": 55500 }, { "epoch": 0.1583101522039035, "grad_norm": 2.0446624755859375, "learning_rate": 4.736149746326828e-05, "loss": 0.7729, "step": 56000 }, { "epoch": 0.15972363570572404, "grad_norm": 2.021606206893921, "learning_rate": 4.7337939404904605e-05, "loss": 0.7736, "step": 56500 }, { "epoch": 0.16113711920754462, "grad_norm": 2.0994176864624023, "learning_rate": 4.731438134654093e-05, "loss": 0.7734, "step": 57000 }, { "epoch": 0.16255060270936517, "grad_norm": 1.966751217842102, "learning_rate": 4.729082328817725e-05, "loss": 0.7703, "step": 57500 }, { "epoch": 0.16396408621118574, "grad_norm": 2.1273880004882812, "learning_rate": 4.7267265229813576e-05, "loss": 0.769, "step": 58000 }, { "epoch": 0.16537756971300632, "grad_norm": 2.1192543506622314, "learning_rate": 4.72437071714499e-05, "loss": 0.7655, "step": 58500 }, { "epoch": 0.16679105321482687, "grad_norm": 1.9828897714614868, "learning_rate": 4.722014911308622e-05, "loss": 0.771, "step": 59000 }, { "epoch": 0.16820453671664745, "grad_norm": 2.1357827186584473, "learning_rate": 4.719659105472255e-05, "loss": 0.7673, "step": 59500 }, { "epoch": 0.169618020218468, "grad_norm": 2.1103930473327637, "learning_rate": 4.717303299635887e-05, "loss": 0.7714, "step": 60000 }, { "epoch": 0.17103150372028858, "grad_norm": 2.0952727794647217, "learning_rate": 4.7149474937995194e-05, "loss": 0.7655, "step": 60500 }, { "epoch": 0.17244498722210913, "grad_norm": 2.2258007526397705, "learning_rate": 4.712591687963152e-05, "loss": 0.7631, "step": 61000 }, { "epoch": 0.1738584707239297, "grad_norm": 1.80638587474823, "learning_rate": 4.710235882126784e-05, "loss": 0.7551, "step": 61500 }, { "epoch": 0.1752719542257503, "grad_norm": 2.081979751586914, "learning_rate": 4.7078800762904165e-05, "loss": 0.7622, "step": 62000 }, { "epoch": 0.17668543772757084, "grad_norm": 2.0041158199310303, "learning_rate": 4.705524270454049e-05, "loss": 0.7609, "step": 62500 }, { "epoch": 0.17809892122939142, "grad_norm": 2.0636978149414062, "learning_rate": 4.703168464617681e-05, "loss": 0.7618, "step": 63000 }, { "epoch": 0.17951240473121197, "grad_norm": 1.962123155593872, "learning_rate": 4.7008126587813136e-05, "loss": 0.7689, "step": 63500 }, { "epoch": 0.18092588823303254, "grad_norm": 2.0643563270568848, "learning_rate": 4.698456852944946e-05, "loss": 0.7603, "step": 64000 }, { "epoch": 0.18233937173485312, "grad_norm": 1.9330848455429077, "learning_rate": 4.6961010471085783e-05, "loss": 0.7536, "step": 64500 }, { "epoch": 0.18375285523667367, "grad_norm": 2.0168087482452393, "learning_rate": 4.693745241272211e-05, "loss": 0.7508, "step": 65000 }, { "epoch": 0.18516633873849425, "grad_norm": 2.137385129928589, "learning_rate": 4.691389435435843e-05, "loss": 0.7511, "step": 65500 }, { "epoch": 0.1865798222403148, "grad_norm": 1.9971131086349487, "learning_rate": 4.6890336295994754e-05, "loss": 0.7514, "step": 66000 }, { "epoch": 0.18799330574213538, "grad_norm": 2.2640182971954346, "learning_rate": 4.686677823763108e-05, "loss": 0.7525, "step": 66500 }, { "epoch": 0.18940678924395593, "grad_norm": 2.118194580078125, "learning_rate": 4.684322017926741e-05, "loss": 0.7543, "step": 67000 }, { "epoch": 0.1908202727457765, "grad_norm": 2.0297279357910156, "learning_rate": 4.681966212090373e-05, "loss": 0.7516, "step": 67500 }, { "epoch": 0.1922337562475971, "grad_norm": 2.082000494003296, "learning_rate": 4.6796104062540056e-05, "loss": 0.7497, "step": 68000 }, { "epoch": 0.19364723974941764, "grad_norm": 2.0502378940582275, "learning_rate": 4.677254600417638e-05, "loss": 0.7538, "step": 68500 }, { "epoch": 0.19506072325123822, "grad_norm": 2.0454342365264893, "learning_rate": 4.67489879458127e-05, "loss": 0.7445, "step": 69000 }, { "epoch": 0.19647420675305877, "grad_norm": 2.2623260021209717, "learning_rate": 4.672542988744903e-05, "loss": 0.7473, "step": 69500 }, { "epoch": 0.19788769025487934, "grad_norm": 1.822770595550537, "learning_rate": 4.670187182908535e-05, "loss": 0.7482, "step": 70000 }, { "epoch": 0.19930117375669992, "grad_norm": 1.8823920488357544, "learning_rate": 4.6678313770721674e-05, "loss": 0.7443, "step": 70500 }, { "epoch": 0.20071465725852047, "grad_norm": 2.0499703884124756, "learning_rate": 4.6654755712358e-05, "loss": 0.7499, "step": 71000 }, { "epoch": 0.20212814076034105, "grad_norm": 2.109389543533325, "learning_rate": 4.663119765399432e-05, "loss": 0.7532, "step": 71500 }, { "epoch": 0.2035416242621616, "grad_norm": 2.098217010498047, "learning_rate": 4.6607639595630645e-05, "loss": 0.7389, "step": 72000 }, { "epoch": 0.20495510776398218, "grad_norm": 2.110330820083618, "learning_rate": 4.658408153726697e-05, "loss": 0.7407, "step": 72500 }, { "epoch": 0.20636859126580276, "grad_norm": 2.0308709144592285, "learning_rate": 4.656052347890329e-05, "loss": 0.7436, "step": 73000 }, { "epoch": 0.2077820747676233, "grad_norm": 2.096755266189575, "learning_rate": 4.6536965420539616e-05, "loss": 0.7438, "step": 73500 }, { "epoch": 0.2091955582694439, "grad_norm": 1.9915010929107666, "learning_rate": 4.651340736217594e-05, "loss": 0.7377, "step": 74000 }, { "epoch": 0.21060904177126444, "grad_norm": 2.020451545715332, "learning_rate": 4.6489849303812264e-05, "loss": 0.7423, "step": 74500 }, { "epoch": 0.21202252527308502, "grad_norm": 2.0489823818206787, "learning_rate": 4.646629124544859e-05, "loss": 0.7343, "step": 75000 }, { "epoch": 0.21343600877490557, "grad_norm": 1.893576741218567, "learning_rate": 4.644273318708491e-05, "loss": 0.7386, "step": 75500 }, { "epoch": 0.21484949227672615, "grad_norm": 2.060296058654785, "learning_rate": 4.6419175128721235e-05, "loss": 0.7369, "step": 76000 }, { "epoch": 0.21626297577854672, "grad_norm": 1.844085931777954, "learning_rate": 4.639561707035756e-05, "loss": 0.7309, "step": 76500 }, { "epoch": 0.21767645928036727, "grad_norm": 2.092665433883667, "learning_rate": 4.637205901199388e-05, "loss": 0.7347, "step": 77000 }, { "epoch": 0.21908994278218785, "grad_norm": 2.127019166946411, "learning_rate": 4.6348500953630206e-05, "loss": 0.7291, "step": 77500 }, { "epoch": 0.2205034262840084, "grad_norm": 1.9473992586135864, "learning_rate": 4.632494289526653e-05, "loss": 0.738, "step": 78000 }, { "epoch": 0.22191690978582898, "grad_norm": 1.87107253074646, "learning_rate": 4.630138483690285e-05, "loss": 0.7337, "step": 78500 }, { "epoch": 0.22333039328764956, "grad_norm": 2.08552622795105, "learning_rate": 4.627782677853918e-05, "loss": 0.731, "step": 79000 }, { "epoch": 0.2247438767894701, "grad_norm": 2.0761590003967285, "learning_rate": 4.62542687201755e-05, "loss": 0.7308, "step": 79500 }, { "epoch": 0.2261573602912907, "grad_norm": 2.104128837585449, "learning_rate": 4.6230710661811824e-05, "loss": 0.729, "step": 80000 }, { "epoch": 0.22757084379311124, "grad_norm": 1.8827178478240967, "learning_rate": 4.620715260344815e-05, "loss": 0.7282, "step": 80500 }, { "epoch": 0.22898432729493182, "grad_norm": 1.964033842086792, "learning_rate": 4.618359454508447e-05, "loss": 0.7261, "step": 81000 }, { "epoch": 0.23039781079675237, "grad_norm": 2.149872064590454, "learning_rate": 4.6160036486720795e-05, "loss": 0.7274, "step": 81500 }, { "epoch": 0.23181129429857295, "grad_norm": 1.956879734992981, "learning_rate": 4.613647842835712e-05, "loss": 0.7202, "step": 82000 }, { "epoch": 0.23322477780039352, "grad_norm": 2.080933094024658, "learning_rate": 4.611292036999344e-05, "loss": 0.7296, "step": 82500 }, { "epoch": 0.23463826130221407, "grad_norm": 1.8324638605117798, "learning_rate": 4.6089362311629766e-05, "loss": 0.7278, "step": 83000 }, { "epoch": 0.23605174480403465, "grad_norm": 2.101034641265869, "learning_rate": 4.606580425326609e-05, "loss": 0.7303, "step": 83500 }, { "epoch": 0.2374652283058552, "grad_norm": 1.910130500793457, "learning_rate": 4.6042246194902414e-05, "loss": 0.7211, "step": 84000 }, { "epoch": 0.23887871180767578, "grad_norm": 2.014700174331665, "learning_rate": 4.601868813653874e-05, "loss": 0.7251, "step": 84500 }, { "epoch": 0.24029219530949636, "grad_norm": 1.9511390924453735, "learning_rate": 4.599513007817506e-05, "loss": 0.7244, "step": 85000 }, { "epoch": 0.2417056788113169, "grad_norm": 2.166968584060669, "learning_rate": 4.5971572019811385e-05, "loss": 0.7211, "step": 85500 }, { "epoch": 0.2431191623131375, "grad_norm": 2.052455186843872, "learning_rate": 4.594801396144771e-05, "loss": 0.7233, "step": 86000 }, { "epoch": 0.24453264581495804, "grad_norm": 1.9972022771835327, "learning_rate": 4.592445590308403e-05, "loss": 0.7249, "step": 86500 }, { "epoch": 0.24594612931677862, "grad_norm": 1.9930678606033325, "learning_rate": 4.5900897844720356e-05, "loss": 0.7205, "step": 87000 }, { "epoch": 0.2473596128185992, "grad_norm": 1.8885375261306763, "learning_rate": 4.587733978635668e-05, "loss": 0.7225, "step": 87500 }, { "epoch": 0.24877309632041975, "grad_norm": 1.9395737648010254, "learning_rate": 4.5853781727993e-05, "loss": 0.7155, "step": 88000 }, { "epoch": 0.2501865798222403, "grad_norm": 1.9710242748260498, "learning_rate": 4.5830223669629327e-05, "loss": 0.7195, "step": 88500 }, { "epoch": 0.2516000633240609, "grad_norm": 1.8622244596481323, "learning_rate": 4.580666561126565e-05, "loss": 0.7179, "step": 89000 }, { "epoch": 0.25301354682588145, "grad_norm": 30.77910614013672, "learning_rate": 4.5783107552901974e-05, "loss": 0.7205, "step": 89500 }, { "epoch": 0.25442703032770203, "grad_norm": 1.987046241760254, "learning_rate": 4.57595494945383e-05, "loss": 0.7171, "step": 90000 }, { "epoch": 0.2558405138295226, "grad_norm": 1.9005072116851807, "learning_rate": 4.573599143617462e-05, "loss": 0.718, "step": 90500 }, { "epoch": 0.25725399733134313, "grad_norm": 1.9660764932632446, "learning_rate": 4.5712433377810945e-05, "loss": 0.7145, "step": 91000 }, { "epoch": 0.2586674808331637, "grad_norm": 1.9201092720031738, "learning_rate": 4.568887531944727e-05, "loss": 0.7138, "step": 91500 }, { "epoch": 0.2600809643349843, "grad_norm": 1.9686309099197388, "learning_rate": 4.566531726108359e-05, "loss": 0.7121, "step": 92000 }, { "epoch": 0.26149444783680487, "grad_norm": 2.0785725116729736, "learning_rate": 4.5641759202719916e-05, "loss": 0.7151, "step": 92500 }, { "epoch": 0.2629079313386254, "grad_norm": 2.1494646072387695, "learning_rate": 4.561820114435624e-05, "loss": 0.7143, "step": 93000 }, { "epoch": 0.26432141484044597, "grad_norm": 2.1971096992492676, "learning_rate": 4.559464308599256e-05, "loss": 0.7112, "step": 93500 }, { "epoch": 0.26573489834226655, "grad_norm": 1.9555832147598267, "learning_rate": 4.557108502762889e-05, "loss": 0.7154, "step": 94000 }, { "epoch": 0.2671483818440871, "grad_norm": 1.9444291591644287, "learning_rate": 4.554752696926522e-05, "loss": 0.7107, "step": 94500 }, { "epoch": 0.2685618653459077, "grad_norm": 2.0291929244995117, "learning_rate": 4.552396891090154e-05, "loss": 0.7118, "step": 95000 }, { "epoch": 0.2699753488477282, "grad_norm": 1.7437961101531982, "learning_rate": 4.5500410852537865e-05, "loss": 0.7096, "step": 95500 }, { "epoch": 0.2713888323495488, "grad_norm": 2.0739834308624268, "learning_rate": 4.547685279417419e-05, "loss": 0.7107, "step": 96000 }, { "epoch": 0.2728023158513694, "grad_norm": 1.9369301795959473, "learning_rate": 4.545329473581051e-05, "loss": 0.7076, "step": 96500 }, { "epoch": 0.27421579935318996, "grad_norm": 2.31142258644104, "learning_rate": 4.5429736677446836e-05, "loss": 0.7047, "step": 97000 }, { "epoch": 0.27562928285501054, "grad_norm": 1.986330509185791, "learning_rate": 4.540617861908316e-05, "loss": 0.702, "step": 97500 }, { "epoch": 0.27704276635683106, "grad_norm": 1.816628098487854, "learning_rate": 4.538262056071948e-05, "loss": 0.7101, "step": 98000 }, { "epoch": 0.27845624985865164, "grad_norm": 2.0669784545898438, "learning_rate": 4.535906250235581e-05, "loss": 0.7112, "step": 98500 }, { "epoch": 0.2798697333604722, "grad_norm": 1.964211344718933, "learning_rate": 4.533550444399213e-05, "loss": 0.7052, "step": 99000 }, { "epoch": 0.2812832168622928, "grad_norm": 1.8037883043289185, "learning_rate": 4.5311946385628454e-05, "loss": 0.6964, "step": 99500 }, { "epoch": 0.2826967003641134, "grad_norm": 1.8163939714431763, "learning_rate": 4.528838832726478e-05, "loss": 0.7011, "step": 100000 }, { "epoch": 0.2841101838659339, "grad_norm": 2.103328227996826, "learning_rate": 4.52648302689011e-05, "loss": 0.7031, "step": 100500 }, { "epoch": 0.2855236673677545, "grad_norm": 1.9891421794891357, "learning_rate": 4.5241272210537425e-05, "loss": 0.7038, "step": 101000 }, { "epoch": 0.28693715086957505, "grad_norm": 1.8925179243087769, "learning_rate": 4.521771415217375e-05, "loss": 0.6985, "step": 101500 }, { "epoch": 0.28835063437139563, "grad_norm": 1.973712682723999, "learning_rate": 4.519415609381007e-05, "loss": 0.6981, "step": 102000 }, { "epoch": 0.2897641178732162, "grad_norm": 2.0795533657073975, "learning_rate": 4.5170598035446396e-05, "loss": 0.7049, "step": 102500 }, { "epoch": 0.29117760137503673, "grad_norm": 2.083008289337158, "learning_rate": 4.514703997708272e-05, "loss": 0.707, "step": 103000 }, { "epoch": 0.2925910848768573, "grad_norm": 1.8228724002838135, "learning_rate": 4.5123481918719044e-05, "loss": 0.696, "step": 103500 }, { "epoch": 0.2940045683786779, "grad_norm": 1.9347554445266724, "learning_rate": 4.509992386035537e-05, "loss": 0.6989, "step": 104000 }, { "epoch": 0.29541805188049847, "grad_norm": 2.0042245388031006, "learning_rate": 4.507636580199169e-05, "loss": 0.6991, "step": 104500 }, { "epoch": 0.29683153538231905, "grad_norm": 2.187929153442383, "learning_rate": 4.5052807743628015e-05, "loss": 0.6997, "step": 105000 }, { "epoch": 0.29824501888413957, "grad_norm": 1.896545171737671, "learning_rate": 4.502924968526434e-05, "loss": 0.702, "step": 105500 }, { "epoch": 0.29965850238596015, "grad_norm": 1.9436556100845337, "learning_rate": 4.500569162690066e-05, "loss": 0.7018, "step": 106000 }, { "epoch": 0.3010719858877807, "grad_norm": 2.024386167526245, "learning_rate": 4.4982133568536986e-05, "loss": 0.6984, "step": 106500 }, { "epoch": 0.3024854693896013, "grad_norm": 2.06805682182312, "learning_rate": 4.495857551017331e-05, "loss": 0.6999, "step": 107000 }, { "epoch": 0.3038989528914218, "grad_norm": 1.9496690034866333, "learning_rate": 4.493501745180963e-05, "loss": 0.6972, "step": 107500 }, { "epoch": 0.3053124363932424, "grad_norm": 1.8867801427841187, "learning_rate": 4.491145939344596e-05, "loss": 0.6983, "step": 108000 }, { "epoch": 0.306725919895063, "grad_norm": 1.9701323509216309, "learning_rate": 4.488790133508228e-05, "loss": 0.6938, "step": 108500 }, { "epoch": 0.30813940339688356, "grad_norm": 1.9740166664123535, "learning_rate": 4.486434327671861e-05, "loss": 0.6968, "step": 109000 }, { "epoch": 0.30955288689870414, "grad_norm": 1.901581883430481, "learning_rate": 4.4840785218354934e-05, "loss": 0.6965, "step": 109500 }, { "epoch": 0.31096637040052466, "grad_norm": 1.8771336078643799, "learning_rate": 4.481722715999126e-05, "loss": 0.6928, "step": 110000 }, { "epoch": 0.31237985390234524, "grad_norm": 1.7882453203201294, "learning_rate": 4.479366910162758e-05, "loss": 0.695, "step": 110500 }, { "epoch": 0.3137933374041658, "grad_norm": 2.0589098930358887, "learning_rate": 4.4770111043263906e-05, "loss": 0.6916, "step": 111000 }, { "epoch": 0.3152068209059864, "grad_norm": 1.8660404682159424, "learning_rate": 4.474655298490023e-05, "loss": 0.693, "step": 111500 }, { "epoch": 0.316620304407807, "grad_norm": 1.9338701963424683, "learning_rate": 4.472299492653655e-05, "loss": 0.6812, "step": 112000 }, { "epoch": 0.3180337879096275, "grad_norm": 1.909101963043213, "learning_rate": 4.4699436868172877e-05, "loss": 0.6911, "step": 112500 }, { "epoch": 0.3194472714114481, "grad_norm": 2.0604467391967773, "learning_rate": 4.46758788098092e-05, "loss": 0.6921, "step": 113000 }, { "epoch": 0.32086075491326865, "grad_norm": 1.9924187660217285, "learning_rate": 4.4652320751445524e-05, "loss": 0.6899, "step": 113500 }, { "epoch": 0.32227423841508923, "grad_norm": 1.9027316570281982, "learning_rate": 4.462876269308185e-05, "loss": 0.6908, "step": 114000 }, { "epoch": 0.3236877219169098, "grad_norm": 2.06449031829834, "learning_rate": 4.460520463471817e-05, "loss": 0.688, "step": 114500 }, { "epoch": 0.32510120541873033, "grad_norm": 1.735564112663269, "learning_rate": 4.4581646576354495e-05, "loss": 0.6938, "step": 115000 }, { "epoch": 0.3265146889205509, "grad_norm": 2.034191370010376, "learning_rate": 4.455808851799082e-05, "loss": 0.6902, "step": 115500 }, { "epoch": 0.3279281724223715, "grad_norm": 1.934531569480896, "learning_rate": 4.453453045962714e-05, "loss": 0.6931, "step": 116000 }, { "epoch": 0.32934165592419207, "grad_norm": 2.003404378890991, "learning_rate": 4.4510972401263466e-05, "loss": 0.6924, "step": 116500 }, { "epoch": 0.33075513942601265, "grad_norm": 1.9029284715652466, "learning_rate": 4.448741434289979e-05, "loss": 0.6914, "step": 117000 }, { "epoch": 0.33216862292783317, "grad_norm": 1.8947986364364624, "learning_rate": 4.446385628453611e-05, "loss": 0.6918, "step": 117500 }, { "epoch": 0.33358210642965375, "grad_norm": 1.7989732027053833, "learning_rate": 4.444029822617244e-05, "loss": 0.6903, "step": 118000 }, { "epoch": 0.3349955899314743, "grad_norm": 1.8779282569885254, "learning_rate": 4.441674016780876e-05, "loss": 0.6878, "step": 118500 }, { "epoch": 0.3364090734332949, "grad_norm": 1.9463249444961548, "learning_rate": 4.4393182109445084e-05, "loss": 0.6904, "step": 119000 }, { "epoch": 0.3378225569351154, "grad_norm": 2.168252468109131, "learning_rate": 4.436962405108141e-05, "loss": 0.6932, "step": 119500 }, { "epoch": 0.339236040436936, "grad_norm": 2.0377187728881836, "learning_rate": 4.434606599271773e-05, "loss": 0.6858, "step": 120000 }, { "epoch": 0.3406495239387566, "grad_norm": 1.8659805059432983, "learning_rate": 4.4322507934354055e-05, "loss": 0.6869, "step": 120500 }, { "epoch": 0.34206300744057716, "grad_norm": 1.952498197555542, "learning_rate": 4.429894987599038e-05, "loss": 0.683, "step": 121000 }, { "epoch": 0.34347649094239774, "grad_norm": 2.0258853435516357, "learning_rate": 4.42753918176267e-05, "loss": 0.6857, "step": 121500 }, { "epoch": 0.34488997444421826, "grad_norm": 2.025146722793579, "learning_rate": 4.4251833759263026e-05, "loss": 0.6833, "step": 122000 }, { "epoch": 0.34630345794603884, "grad_norm": 1.9345664978027344, "learning_rate": 4.422827570089935e-05, "loss": 0.6866, "step": 122500 }, { "epoch": 0.3477169414478594, "grad_norm": 1.9523334503173828, "learning_rate": 4.4204717642535674e-05, "loss": 0.6835, "step": 123000 }, { "epoch": 0.34913042494968, "grad_norm": 1.9960036277770996, "learning_rate": 4.4181159584172004e-05, "loss": 0.682, "step": 123500 }, { "epoch": 0.3505439084515006, "grad_norm": 1.8636521100997925, "learning_rate": 4.415760152580833e-05, "loss": 0.6809, "step": 124000 }, { "epoch": 0.3519573919533211, "grad_norm": 1.9780980348587036, "learning_rate": 4.413404346744465e-05, "loss": 0.6837, "step": 124500 }, { "epoch": 0.3533708754551417, "grad_norm": 1.851354718208313, "learning_rate": 4.4110485409080975e-05, "loss": 0.6808, "step": 125000 }, { "epoch": 0.35478435895696225, "grad_norm": 1.980684518814087, "learning_rate": 4.40869273507173e-05, "loss": 0.6879, "step": 125500 }, { "epoch": 0.35619784245878283, "grad_norm": 1.9133230447769165, "learning_rate": 4.406336929235362e-05, "loss": 0.6768, "step": 126000 }, { "epoch": 0.3576113259606034, "grad_norm": 1.9109588861465454, "learning_rate": 4.4039811233989946e-05, "loss": 0.6829, "step": 126500 }, { "epoch": 0.35902480946242393, "grad_norm": 1.8854764699935913, "learning_rate": 4.401625317562627e-05, "loss": 0.6804, "step": 127000 }, { "epoch": 0.3604382929642445, "grad_norm": 2.521974802017212, "learning_rate": 4.3992695117262594e-05, "loss": 0.682, "step": 127500 }, { "epoch": 0.3618517764660651, "grad_norm": 2.048243522644043, "learning_rate": 4.396913705889892e-05, "loss": 0.6816, "step": 128000 }, { "epoch": 0.36326525996788567, "grad_norm": 1.901122808456421, "learning_rate": 4.394557900053524e-05, "loss": 0.6805, "step": 128500 }, { "epoch": 0.36467874346970625, "grad_norm": 1.823983907699585, "learning_rate": 4.3922020942171565e-05, "loss": 0.6765, "step": 129000 }, { "epoch": 0.36609222697152677, "grad_norm": 1.859591007232666, "learning_rate": 4.389846288380789e-05, "loss": 0.6743, "step": 129500 }, { "epoch": 0.36750571047334735, "grad_norm": 1.9652605056762695, "learning_rate": 4.387490482544421e-05, "loss": 0.6785, "step": 130000 }, { "epoch": 0.3689191939751679, "grad_norm": 1.9083929061889648, "learning_rate": 4.3851346767080536e-05, "loss": 0.6829, "step": 130500 }, { "epoch": 0.3703326774769885, "grad_norm": 1.6793047189712524, "learning_rate": 4.382778870871686e-05, "loss": 0.6788, "step": 131000 }, { "epoch": 0.3717461609788091, "grad_norm": 2.0139853954315186, "learning_rate": 4.380423065035318e-05, "loss": 0.676, "step": 131500 }, { "epoch": 0.3731596444806296, "grad_norm": 2.1305205821990967, "learning_rate": 4.3780672591989507e-05, "loss": 0.6775, "step": 132000 }, { "epoch": 0.3745731279824502, "grad_norm": 3.1957545280456543, "learning_rate": 4.375711453362583e-05, "loss": 0.6771, "step": 132500 }, { "epoch": 0.37598661148427076, "grad_norm": 1.8535666465759277, "learning_rate": 4.3733556475262154e-05, "loss": 0.676, "step": 133000 }, { "epoch": 0.37740009498609134, "grad_norm": 1.7474366426467896, "learning_rate": 4.370999841689848e-05, "loss": 0.6697, "step": 133500 }, { "epoch": 0.37881357848791186, "grad_norm": 1.8589524030685425, "learning_rate": 4.36864403585348e-05, "loss": 0.6718, "step": 134000 }, { "epoch": 0.38022706198973244, "grad_norm": 1.9658094644546509, "learning_rate": 4.3662882300171125e-05, "loss": 0.6729, "step": 134500 }, { "epoch": 0.381640545491553, "grad_norm": 2.072741746902466, "learning_rate": 4.363932424180745e-05, "loss": 0.6713, "step": 135000 }, { "epoch": 0.3830540289933736, "grad_norm": 1.9752076864242554, "learning_rate": 4.361576618344377e-05, "loss": 0.671, "step": 135500 }, { "epoch": 0.3844675124951942, "grad_norm": 1.813603162765503, "learning_rate": 4.3592208125080096e-05, "loss": 0.6758, "step": 136000 }, { "epoch": 0.3858809959970147, "grad_norm": 1.9229671955108643, "learning_rate": 4.356865006671642e-05, "loss": 0.672, "step": 136500 }, { "epoch": 0.3872944794988353, "grad_norm": 1.8557090759277344, "learning_rate": 4.354509200835274e-05, "loss": 0.6727, "step": 137000 }, { "epoch": 0.38870796300065585, "grad_norm": 1.9235056638717651, "learning_rate": 4.352153394998907e-05, "loss": 0.6772, "step": 137500 }, { "epoch": 0.39012144650247643, "grad_norm": 1.8372869491577148, "learning_rate": 4.34979758916254e-05, "loss": 0.6731, "step": 138000 }, { "epoch": 0.391534930004297, "grad_norm": 3.1190271377563477, "learning_rate": 4.347441783326172e-05, "loss": 0.6715, "step": 138500 }, { "epoch": 0.39294841350611753, "grad_norm": 1.8520714044570923, "learning_rate": 4.3450859774898045e-05, "loss": 0.6695, "step": 139000 }, { "epoch": 0.3943618970079381, "grad_norm": 1.9785873889923096, "learning_rate": 4.342730171653437e-05, "loss": 0.6669, "step": 139500 }, { "epoch": 0.3957753805097587, "grad_norm": 1.9427213668823242, "learning_rate": 4.340374365817069e-05, "loss": 0.6657, "step": 140000 }, { "epoch": 0.39718886401157927, "grad_norm": 1.9251227378845215, "learning_rate": 4.3380185599807016e-05, "loss": 0.6653, "step": 140500 }, { "epoch": 0.39860234751339985, "grad_norm": 1.799850583076477, "learning_rate": 4.335662754144334e-05, "loss": 0.6684, "step": 141000 }, { "epoch": 0.40001583101522037, "grad_norm": 1.8843568563461304, "learning_rate": 4.333306948307966e-05, "loss": 0.6663, "step": 141500 }, { "epoch": 0.40142931451704095, "grad_norm": 1.9539752006530762, "learning_rate": 4.330951142471599e-05, "loss": 0.6683, "step": 142000 }, { "epoch": 0.4028427980188615, "grad_norm": 1.9506709575653076, "learning_rate": 4.328595336635231e-05, "loss": 0.6697, "step": 142500 }, { "epoch": 0.4042562815206821, "grad_norm": 2.0598342418670654, "learning_rate": 4.3262395307988634e-05, "loss": 0.6688, "step": 143000 }, { "epoch": 0.4056697650225027, "grad_norm": 2.057126998901367, "learning_rate": 4.323883724962496e-05, "loss": 0.6661, "step": 143500 }, { "epoch": 0.4070832485243232, "grad_norm": 1.8781315088272095, "learning_rate": 4.321527919126128e-05, "loss": 0.6649, "step": 144000 }, { "epoch": 0.4084967320261438, "grad_norm": 1.768463134765625, "learning_rate": 4.3191721132897605e-05, "loss": 0.6617, "step": 144500 }, { "epoch": 0.40991021552796436, "grad_norm": 1.9361228942871094, "learning_rate": 4.316816307453393e-05, "loss": 0.6635, "step": 145000 }, { "epoch": 0.41132369902978494, "grad_norm": 2.1257829666137695, "learning_rate": 4.314460501617025e-05, "loss": 0.6635, "step": 145500 }, { "epoch": 0.4127371825316055, "grad_norm": 2.0011348724365234, "learning_rate": 4.3121046957806576e-05, "loss": 0.6676, "step": 146000 }, { "epoch": 0.41415066603342604, "grad_norm": 2.064908504486084, "learning_rate": 4.30974888994429e-05, "loss": 0.6655, "step": 146500 }, { "epoch": 0.4155641495352466, "grad_norm": 2.05683970451355, "learning_rate": 4.3073930841079224e-05, "loss": 0.6624, "step": 147000 }, { "epoch": 0.4169776330370672, "grad_norm": 1.9516276121139526, "learning_rate": 4.305037278271555e-05, "loss": 0.6567, "step": 147500 }, { "epoch": 0.4183911165388878, "grad_norm": 2.1104369163513184, "learning_rate": 4.302681472435187e-05, "loss": 0.6631, "step": 148000 }, { "epoch": 0.4198046000407083, "grad_norm": 1.9445232152938843, "learning_rate": 4.3003256665988195e-05, "loss": 0.667, "step": 148500 }, { "epoch": 0.4212180835425289, "grad_norm": 1.9183613061904907, "learning_rate": 4.297969860762452e-05, "loss": 0.6614, "step": 149000 }, { "epoch": 0.42263156704434945, "grad_norm": 1.9330081939697266, "learning_rate": 4.295614054926084e-05, "loss": 0.6622, "step": 149500 }, { "epoch": 0.42404505054617003, "grad_norm": 1.8132728338241577, "learning_rate": 4.2932582490897166e-05, "loss": 0.6614, "step": 150000 }, { "epoch": 0.4254585340479906, "grad_norm": 2.0082554817199707, "learning_rate": 4.290902443253349e-05, "loss": 0.6605, "step": 150500 }, { "epoch": 0.42687201754981113, "grad_norm": 2.0493431091308594, "learning_rate": 4.288546637416981e-05, "loss": 0.6575, "step": 151000 }, { "epoch": 0.4282855010516317, "grad_norm": 1.9745382070541382, "learning_rate": 4.286190831580614e-05, "loss": 0.6649, "step": 151500 }, { "epoch": 0.4296989845534523, "grad_norm": 1.9078614711761475, "learning_rate": 4.283835025744246e-05, "loss": 0.6611, "step": 152000 }, { "epoch": 0.43111246805527287, "grad_norm": 1.8615186214447021, "learning_rate": 4.281479219907879e-05, "loss": 0.6625, "step": 152500 }, { "epoch": 0.43252595155709345, "grad_norm": 2.0308945178985596, "learning_rate": 4.2791234140715114e-05, "loss": 0.6599, "step": 153000 }, { "epoch": 0.43393943505891397, "grad_norm": 1.7711007595062256, "learning_rate": 4.276767608235144e-05, "loss": 0.6577, "step": 153500 }, { "epoch": 0.43535291856073455, "grad_norm": 1.8438256978988647, "learning_rate": 4.274411802398776e-05, "loss": 0.6573, "step": 154000 }, { "epoch": 0.4367664020625551, "grad_norm": 1.9787267446517944, "learning_rate": 4.2720559965624085e-05, "loss": 0.6611, "step": 154500 }, { "epoch": 0.4381798855643757, "grad_norm": 2.112440347671509, "learning_rate": 4.269700190726041e-05, "loss": 0.6596, "step": 155000 }, { "epoch": 0.4395933690661963, "grad_norm": 1.723192811012268, "learning_rate": 4.267344384889673e-05, "loss": 0.6603, "step": 155500 }, { "epoch": 0.4410068525680168, "grad_norm": 1.8361139297485352, "learning_rate": 4.2649885790533057e-05, "loss": 0.6576, "step": 156000 }, { "epoch": 0.4424203360698374, "grad_norm": 1.7655699253082275, "learning_rate": 4.262632773216938e-05, "loss": 0.6579, "step": 156500 }, { "epoch": 0.44383381957165796, "grad_norm": 1.933007001876831, "learning_rate": 4.2602769673805704e-05, "loss": 0.6536, "step": 157000 }, { "epoch": 0.44524730307347854, "grad_norm": 1.941925048828125, "learning_rate": 4.257921161544203e-05, "loss": 0.6623, "step": 157500 }, { "epoch": 0.4466607865752991, "grad_norm": 1.913564682006836, "learning_rate": 4.255565355707835e-05, "loss": 0.6494, "step": 158000 }, { "epoch": 0.44807427007711964, "grad_norm": 2.0821611881256104, "learning_rate": 4.2532095498714675e-05, "loss": 0.6531, "step": 158500 }, { "epoch": 0.4494877535789402, "grad_norm": 1.893485426902771, "learning_rate": 4.2508537440351e-05, "loss": 0.6548, "step": 159000 }, { "epoch": 0.4509012370807608, "grad_norm": 1.8955482244491577, "learning_rate": 4.248497938198732e-05, "loss": 0.6547, "step": 159500 }, { "epoch": 0.4523147205825814, "grad_norm": 1.798446536064148, "learning_rate": 4.2461421323623646e-05, "loss": 0.6545, "step": 160000 }, { "epoch": 0.45372820408440195, "grad_norm": 1.868611216545105, "learning_rate": 4.243786326525997e-05, "loss": 0.6507, "step": 160500 }, { "epoch": 0.4551416875862225, "grad_norm": 2.1023874282836914, "learning_rate": 4.241430520689629e-05, "loss": 0.6582, "step": 161000 }, { "epoch": 0.45655517108804305, "grad_norm": 2.115084648132324, "learning_rate": 4.239074714853262e-05, "loss": 0.6537, "step": 161500 }, { "epoch": 0.45796865458986363, "grad_norm": 2.0992658138275146, "learning_rate": 4.236718909016894e-05, "loss": 0.6491, "step": 162000 }, { "epoch": 0.4593821380916842, "grad_norm": 1.8496648073196411, "learning_rate": 4.2343631031805264e-05, "loss": 0.6574, "step": 162500 }, { "epoch": 0.46079562159350473, "grad_norm": 1.8616197109222412, "learning_rate": 4.232007297344159e-05, "loss": 0.6496, "step": 163000 }, { "epoch": 0.4622091050953253, "grad_norm": 1.9944509267807007, "learning_rate": 4.229651491507791e-05, "loss": 0.6501, "step": 163500 }, { "epoch": 0.4636225885971459, "grad_norm": 1.870962142944336, "learning_rate": 4.2272956856714235e-05, "loss": 0.6569, "step": 164000 }, { "epoch": 0.46503607209896647, "grad_norm": 2.0562937259674072, "learning_rate": 4.224939879835056e-05, "loss": 0.6482, "step": 164500 }, { "epoch": 0.46644955560078705, "grad_norm": 1.9955099821090698, "learning_rate": 4.222584073998688e-05, "loss": 0.6532, "step": 165000 }, { "epoch": 0.46786303910260757, "grad_norm": 1.9783668518066406, "learning_rate": 4.2202282681623206e-05, "loss": 0.6524, "step": 165500 }, { "epoch": 0.46927652260442815, "grad_norm": 1.9404637813568115, "learning_rate": 4.217872462325953e-05, "loss": 0.655, "step": 166000 }, { "epoch": 0.4706900061062487, "grad_norm": 2.0397632122039795, "learning_rate": 4.2155166564895854e-05, "loss": 0.6493, "step": 166500 }, { "epoch": 0.4721034896080693, "grad_norm": 2.070631980895996, "learning_rate": 4.2131608506532184e-05, "loss": 0.6535, "step": 167000 }, { "epoch": 0.4735169731098899, "grad_norm": 2.1221063137054443, "learning_rate": 4.210805044816851e-05, "loss": 0.6533, "step": 167500 }, { "epoch": 0.4749304566117104, "grad_norm": 1.9808456897735596, "learning_rate": 4.208449238980483e-05, "loss": 0.6507, "step": 168000 }, { "epoch": 0.476343940113531, "grad_norm": 1.9055386781692505, "learning_rate": 4.2060934331441155e-05, "loss": 0.6458, "step": 168500 }, { "epoch": 0.47775742361535156, "grad_norm": 1.8637745380401611, "learning_rate": 4.203737627307748e-05, "loss": 0.6518, "step": 169000 }, { "epoch": 0.47917090711717214, "grad_norm": 1.8349119424819946, "learning_rate": 4.20138182147138e-05, "loss": 0.6531, "step": 169500 }, { "epoch": 0.4805843906189927, "grad_norm": 1.9962233304977417, "learning_rate": 4.1990260156350126e-05, "loss": 0.6482, "step": 170000 }, { "epoch": 0.48199787412081324, "grad_norm": 1.974079966545105, "learning_rate": 4.196670209798645e-05, "loss": 0.6481, "step": 170500 }, { "epoch": 0.4834113576226338, "grad_norm": 1.9090614318847656, "learning_rate": 4.1943144039622774e-05, "loss": 0.6497, "step": 171000 }, { "epoch": 0.4848248411244544, "grad_norm": 1.8768260478973389, "learning_rate": 4.19195859812591e-05, "loss": 0.6456, "step": 171500 }, { "epoch": 0.486238324626275, "grad_norm": 2.0779778957366943, "learning_rate": 4.189602792289542e-05, "loss": 0.6495, "step": 172000 }, { "epoch": 0.48765180812809555, "grad_norm": 1.9067360162734985, "learning_rate": 4.1872469864531745e-05, "loss": 0.6549, "step": 172500 }, { "epoch": 0.4890652916299161, "grad_norm": 1.8777134418487549, "learning_rate": 4.184891180616807e-05, "loss": 0.6494, "step": 173000 }, { "epoch": 0.49047877513173666, "grad_norm": 1.7986242771148682, "learning_rate": 4.182535374780439e-05, "loss": 0.6464, "step": 173500 }, { "epoch": 0.49189225863355723, "grad_norm": 1.9598702192306519, "learning_rate": 4.1801795689440716e-05, "loss": 0.6453, "step": 174000 }, { "epoch": 0.4933057421353778, "grad_norm": 1.8181840181350708, "learning_rate": 4.177823763107704e-05, "loss": 0.6464, "step": 174500 }, { "epoch": 0.4947192256371984, "grad_norm": 1.6891690492630005, "learning_rate": 4.175467957271336e-05, "loss": 0.6468, "step": 175000 }, { "epoch": 0.4961327091390189, "grad_norm": 1.9664174318313599, "learning_rate": 4.1731121514349687e-05, "loss": 0.6461, "step": 175500 }, { "epoch": 0.4975461926408395, "grad_norm": 1.946211814880371, "learning_rate": 4.170756345598601e-05, "loss": 0.6473, "step": 176000 }, { "epoch": 0.49895967614266007, "grad_norm": 1.8292622566223145, "learning_rate": 4.1684005397622334e-05, "loss": 0.6487, "step": 176500 }, { "epoch": 0.5003731596444806, "grad_norm": 1.8863071203231812, "learning_rate": 4.166044733925866e-05, "loss": 0.6467, "step": 177000 }, { "epoch": 0.5017866431463012, "grad_norm": 2.053734302520752, "learning_rate": 4.163688928089498e-05, "loss": 0.6378, "step": 177500 }, { "epoch": 0.5032001266481217, "grad_norm": 1.848065972328186, "learning_rate": 4.1613331222531305e-05, "loss": 0.6474, "step": 178000 }, { "epoch": 0.5046136101499423, "grad_norm": 1.9176794290542603, "learning_rate": 4.158977316416763e-05, "loss": 0.6427, "step": 178500 }, { "epoch": 0.5060270936517629, "grad_norm": 1.985337734222412, "learning_rate": 4.156621510580395e-05, "loss": 0.6379, "step": 179000 }, { "epoch": 0.5074405771535835, "grad_norm": 2.222973346710205, "learning_rate": 4.1542657047440276e-05, "loss": 0.643, "step": 179500 }, { "epoch": 0.5088540606554041, "grad_norm": 1.9308265447616577, "learning_rate": 4.15190989890766e-05, "loss": 0.6424, "step": 180000 }, { "epoch": 0.5102675441572246, "grad_norm": 1.9207767248153687, "learning_rate": 4.149554093071292e-05, "loss": 0.6461, "step": 180500 }, { "epoch": 0.5116810276590452, "grad_norm": 1.790248990058899, "learning_rate": 4.147198287234925e-05, "loss": 0.642, "step": 181000 }, { "epoch": 0.5130945111608657, "grad_norm": 2.056457042694092, "learning_rate": 4.144842481398558e-05, "loss": 0.639, "step": 181500 }, { "epoch": 0.5145079946626863, "grad_norm": 1.7488899230957031, "learning_rate": 4.14248667556219e-05, "loss": 0.6395, "step": 182000 }, { "epoch": 0.5159214781645068, "grad_norm": 1.9955261945724487, "learning_rate": 4.1401308697258225e-05, "loss": 0.645, "step": 182500 }, { "epoch": 0.5173349616663274, "grad_norm": 1.9219900369644165, "learning_rate": 4.137775063889455e-05, "loss": 0.6412, "step": 183000 }, { "epoch": 0.518748445168148, "grad_norm": 1.9105889797210693, "learning_rate": 4.135419258053087e-05, "loss": 0.6412, "step": 183500 }, { "epoch": 0.5201619286699686, "grad_norm": 1.8840619325637817, "learning_rate": 4.1330634522167196e-05, "loss": 0.6358, "step": 184000 }, { "epoch": 0.5215754121717892, "grad_norm": 2.1773383617401123, "learning_rate": 4.130707646380352e-05, "loss": 0.6357, "step": 184500 }, { "epoch": 0.5229888956736097, "grad_norm": 6.703322887420654, "learning_rate": 4.128351840543984e-05, "loss": 0.6427, "step": 185000 }, { "epoch": 0.5244023791754303, "grad_norm": 1.8201607465744019, "learning_rate": 4.125996034707617e-05, "loss": 0.6426, "step": 185500 }, { "epoch": 0.5258158626772508, "grad_norm": 1.88836669921875, "learning_rate": 4.123640228871249e-05, "loss": 0.6375, "step": 186000 }, { "epoch": 0.5272293461790714, "grad_norm": 1.9892611503601074, "learning_rate": 4.1212844230348814e-05, "loss": 0.6396, "step": 186500 }, { "epoch": 0.5286428296808919, "grad_norm": 1.816945195198059, "learning_rate": 4.118928617198514e-05, "loss": 0.6406, "step": 187000 }, { "epoch": 0.5300563131827125, "grad_norm": 1.9256807565689087, "learning_rate": 4.116572811362146e-05, "loss": 0.6391, "step": 187500 }, { "epoch": 0.5314697966845331, "grad_norm": 1.913210391998291, "learning_rate": 4.1142170055257785e-05, "loss": 0.6391, "step": 188000 }, { "epoch": 0.5328832801863537, "grad_norm": 2.127396583557129, "learning_rate": 4.111861199689411e-05, "loss": 0.6364, "step": 188500 }, { "epoch": 0.5342967636881742, "grad_norm": 1.8560317754745483, "learning_rate": 4.109505393853043e-05, "loss": 0.6361, "step": 189000 }, { "epoch": 0.5357102471899948, "grad_norm": 1.8964972496032715, "learning_rate": 4.1071495880166756e-05, "loss": 0.6378, "step": 189500 }, { "epoch": 0.5371237306918154, "grad_norm": 2.1880860328674316, "learning_rate": 4.104793782180308e-05, "loss": 0.6381, "step": 190000 }, { "epoch": 0.538537214193636, "grad_norm": 1.8322962522506714, "learning_rate": 4.1024379763439404e-05, "loss": 0.6384, "step": 190500 }, { "epoch": 0.5399506976954564, "grad_norm": 1.8627541065216064, "learning_rate": 4.100082170507573e-05, "loss": 0.6392, "step": 191000 }, { "epoch": 0.541364181197277, "grad_norm": 1.977951169013977, "learning_rate": 4.097726364671205e-05, "loss": 0.6385, "step": 191500 }, { "epoch": 0.5427776646990976, "grad_norm": 1.9309113025665283, "learning_rate": 4.0953705588348375e-05, "loss": 0.6378, "step": 192000 }, { "epoch": 0.5441911482009182, "grad_norm": 1.9884194135665894, "learning_rate": 4.09301475299847e-05, "loss": 0.6371, "step": 192500 }, { "epoch": 0.5456046317027388, "grad_norm": 1.9028576612472534, "learning_rate": 4.090658947162102e-05, "loss": 0.6344, "step": 193000 }, { "epoch": 0.5470181152045593, "grad_norm": 1.8197699785232544, "learning_rate": 4.0883031413257346e-05, "loss": 0.6409, "step": 193500 }, { "epoch": 0.5484315987063799, "grad_norm": 1.6265079975128174, "learning_rate": 4.085947335489367e-05, "loss": 0.6385, "step": 194000 }, { "epoch": 0.5498450822082005, "grad_norm": 1.931383490562439, "learning_rate": 4.083591529652999e-05, "loss": 0.6357, "step": 194500 }, { "epoch": 0.5512585657100211, "grad_norm": 1.8238074779510498, "learning_rate": 4.081235723816632e-05, "loss": 0.6316, "step": 195000 }, { "epoch": 0.5526720492118417, "grad_norm": 1.8526809215545654, "learning_rate": 4.078879917980264e-05, "loss": 0.6365, "step": 195500 }, { "epoch": 0.5540855327136621, "grad_norm": 1.812662124633789, "learning_rate": 4.076524112143897e-05, "loss": 0.6327, "step": 196000 }, { "epoch": 0.5554990162154827, "grad_norm": 1.8198567628860474, "learning_rate": 4.0741683063075294e-05, "loss": 0.6316, "step": 196500 }, { "epoch": 0.5569124997173033, "grad_norm": 2.0398592948913574, "learning_rate": 4.071812500471162e-05, "loss": 0.6314, "step": 197000 }, { "epoch": 0.5583259832191239, "grad_norm": 1.874341607093811, "learning_rate": 4.069456694634794e-05, "loss": 0.6304, "step": 197500 }, { "epoch": 0.5597394667209444, "grad_norm": 1.8874528408050537, "learning_rate": 4.0671008887984265e-05, "loss": 0.6303, "step": 198000 }, { "epoch": 0.561152950222765, "grad_norm": 1.956292748451233, "learning_rate": 4.064745082962059e-05, "loss": 0.6312, "step": 198500 }, { "epoch": 0.5625664337245856, "grad_norm": 1.8418341875076294, "learning_rate": 4.062389277125691e-05, "loss": 0.6319, "step": 199000 }, { "epoch": 0.5639799172264062, "grad_norm": 2.0802693367004395, "learning_rate": 4.0600334712893236e-05, "loss": 0.634, "step": 199500 }, { "epoch": 0.5653934007282267, "grad_norm": 2.0167183876037598, "learning_rate": 4.057677665452956e-05, "loss": 0.6343, "step": 200000 }, { "epoch": 0.5668068842300472, "grad_norm": 3.3036744594573975, "learning_rate": 4.0553218596165884e-05, "loss": 0.6306, "step": 200500 }, { "epoch": 0.5682203677318678, "grad_norm": 2.351076364517212, "learning_rate": 4.052966053780221e-05, "loss": 0.6299, "step": 201000 }, { "epoch": 0.5696338512336884, "grad_norm": 1.8994574546813965, "learning_rate": 4.050610247943853e-05, "loss": 0.6297, "step": 201500 }, { "epoch": 0.571047334735509, "grad_norm": 1.8442100286483765, "learning_rate": 4.0482544421074855e-05, "loss": 0.6328, "step": 202000 }, { "epoch": 0.5724608182373295, "grad_norm": 1.8059977293014526, "learning_rate": 4.045898636271118e-05, "loss": 0.6303, "step": 202500 }, { "epoch": 0.5738743017391501, "grad_norm": 1.9043519496917725, "learning_rate": 4.04354283043475e-05, "loss": 0.6317, "step": 203000 }, { "epoch": 0.5752877852409707, "grad_norm": 1.8900419473648071, "learning_rate": 4.0411870245983826e-05, "loss": 0.63, "step": 203500 }, { "epoch": 0.5767012687427913, "grad_norm": 1.9645659923553467, "learning_rate": 4.038831218762015e-05, "loss": 0.6323, "step": 204000 }, { "epoch": 0.5781147522446118, "grad_norm": 1.8183231353759766, "learning_rate": 4.036475412925647e-05, "loss": 0.6343, "step": 204500 }, { "epoch": 0.5795282357464324, "grad_norm": 1.7812124490737915, "learning_rate": 4.03411960708928e-05, "loss": 0.6283, "step": 205000 }, { "epoch": 0.5809417192482529, "grad_norm": 1.8729110956192017, "learning_rate": 4.031763801252912e-05, "loss": 0.6338, "step": 205500 }, { "epoch": 0.5823552027500735, "grad_norm": 2.014174222946167, "learning_rate": 4.0294079954165444e-05, "loss": 0.6328, "step": 206000 }, { "epoch": 0.583768686251894, "grad_norm": 2.0063443183898926, "learning_rate": 4.027052189580177e-05, "loss": 0.6299, "step": 206500 }, { "epoch": 0.5851821697537146, "grad_norm": 1.8113768100738525, "learning_rate": 4.024696383743809e-05, "loss": 0.6325, "step": 207000 }, { "epoch": 0.5865956532555352, "grad_norm": 1.9672755002975464, "learning_rate": 4.0223405779074415e-05, "loss": 0.6296, "step": 207500 }, { "epoch": 0.5880091367573558, "grad_norm": 1.819535732269287, "learning_rate": 4.019984772071074e-05, "loss": 0.6294, "step": 208000 }, { "epoch": 0.5894226202591764, "grad_norm": 1.7239525318145752, "learning_rate": 4.017628966234706e-05, "loss": 0.6282, "step": 208500 }, { "epoch": 0.5908361037609969, "grad_norm": 1.8155593872070312, "learning_rate": 4.0152731603983386e-05, "loss": 0.6282, "step": 209000 }, { "epoch": 0.5922495872628175, "grad_norm": 1.9770705699920654, "learning_rate": 4.012917354561971e-05, "loss": 0.6293, "step": 209500 }, { "epoch": 0.5936630707646381, "grad_norm": 1.8571830987930298, "learning_rate": 4.0105615487256034e-05, "loss": 0.6268, "step": 210000 }, { "epoch": 0.5950765542664586, "grad_norm": 1.8360346555709839, "learning_rate": 4.0082057428892364e-05, "loss": 0.6293, "step": 210500 }, { "epoch": 0.5964900377682791, "grad_norm": 1.8974744081497192, "learning_rate": 4.005849937052869e-05, "loss": 0.6298, "step": 211000 }, { "epoch": 0.5979035212700997, "grad_norm": 2.187601089477539, "learning_rate": 4.003494131216501e-05, "loss": 0.6269, "step": 211500 }, { "epoch": 0.5993170047719203, "grad_norm": 1.801985740661621, "learning_rate": 4.0011383253801335e-05, "loss": 0.6285, "step": 212000 }, { "epoch": 0.6007304882737409, "grad_norm": 1.7649908065795898, "learning_rate": 3.998782519543766e-05, "loss": 0.6261, "step": 212500 }, { "epoch": 0.6021439717755614, "grad_norm": 1.7635537385940552, "learning_rate": 3.996426713707398e-05, "loss": 0.6306, "step": 213000 }, { "epoch": 0.603557455277382, "grad_norm": 1.8965436220169067, "learning_rate": 3.9940709078710306e-05, "loss": 0.6267, "step": 213500 }, { "epoch": 0.6049709387792026, "grad_norm": 2.024097442626953, "learning_rate": 3.991715102034663e-05, "loss": 0.6276, "step": 214000 }, { "epoch": 0.6063844222810232, "grad_norm": 1.8323618173599243, "learning_rate": 3.9893592961982953e-05, "loss": 0.6257, "step": 214500 }, { "epoch": 0.6077979057828437, "grad_norm": 2.0031492710113525, "learning_rate": 3.987003490361928e-05, "loss": 0.6282, "step": 215000 }, { "epoch": 0.6092113892846642, "grad_norm": 1.821746826171875, "learning_rate": 3.98464768452556e-05, "loss": 0.6257, "step": 215500 }, { "epoch": 0.6106248727864848, "grad_norm": 1.7302824258804321, "learning_rate": 3.9822918786891925e-05, "loss": 0.6262, "step": 216000 }, { "epoch": 0.6120383562883054, "grad_norm": 1.8548214435577393, "learning_rate": 3.979936072852825e-05, "loss": 0.6229, "step": 216500 }, { "epoch": 0.613451839790126, "grad_norm": 1.9032096862792969, "learning_rate": 3.977580267016457e-05, "loss": 0.6267, "step": 217000 }, { "epoch": 0.6148653232919465, "grad_norm": 1.8552892208099365, "learning_rate": 3.9752244611800896e-05, "loss": 0.6274, "step": 217500 }, { "epoch": 0.6162788067937671, "grad_norm": 1.812547206878662, "learning_rate": 3.972868655343722e-05, "loss": 0.6212, "step": 218000 }, { "epoch": 0.6176922902955877, "grad_norm": 1.836055874824524, "learning_rate": 3.970512849507354e-05, "loss": 0.6218, "step": 218500 }, { "epoch": 0.6191057737974083, "grad_norm": 1.8733906745910645, "learning_rate": 3.9681570436709867e-05, "loss": 0.6251, "step": 219000 }, { "epoch": 0.6205192572992289, "grad_norm": 1.8623510599136353, "learning_rate": 3.965801237834619e-05, "loss": 0.6259, "step": 219500 }, { "epoch": 0.6219327408010493, "grad_norm": 1.8023607730865479, "learning_rate": 3.9634454319982514e-05, "loss": 0.624, "step": 220000 }, { "epoch": 0.6233462243028699, "grad_norm": 1.9083937406539917, "learning_rate": 3.961089626161884e-05, "loss": 0.6237, "step": 220500 }, { "epoch": 0.6247597078046905, "grad_norm": 2.021145820617676, "learning_rate": 3.958733820325516e-05, "loss": 0.6175, "step": 221000 }, { "epoch": 0.6261731913065111, "grad_norm": 1.871703863143921, "learning_rate": 3.9563780144891485e-05, "loss": 0.6219, "step": 221500 }, { "epoch": 0.6275866748083316, "grad_norm": 1.9064445495605469, "learning_rate": 3.954022208652781e-05, "loss": 0.6199, "step": 222000 }, { "epoch": 0.6290001583101522, "grad_norm": 1.7873480319976807, "learning_rate": 3.951666402816413e-05, "loss": 0.6246, "step": 222500 }, { "epoch": 0.6304136418119728, "grad_norm": 1.9441392421722412, "learning_rate": 3.9493105969800456e-05, "loss": 0.6199, "step": 223000 }, { "epoch": 0.6318271253137934, "grad_norm": 1.8134955167770386, "learning_rate": 3.946954791143678e-05, "loss": 0.6207, "step": 223500 }, { "epoch": 0.633240608815614, "grad_norm": 2.0457184314727783, "learning_rate": 3.94459898530731e-05, "loss": 0.6202, "step": 224000 }, { "epoch": 0.6346540923174345, "grad_norm": 1.812278389930725, "learning_rate": 3.942243179470943e-05, "loss": 0.6228, "step": 224500 }, { "epoch": 0.636067575819255, "grad_norm": 2.0103724002838135, "learning_rate": 3.939887373634575e-05, "loss": 0.6208, "step": 225000 }, { "epoch": 0.6374810593210756, "grad_norm": 1.7944782972335815, "learning_rate": 3.9375315677982074e-05, "loss": 0.6156, "step": 225500 }, { "epoch": 0.6388945428228961, "grad_norm": 1.84700345993042, "learning_rate": 3.93517576196184e-05, "loss": 0.6165, "step": 226000 }, { "epoch": 0.6403080263247167, "grad_norm": 1.7678914070129395, "learning_rate": 3.932819956125472e-05, "loss": 0.6178, "step": 226500 }, { "epoch": 0.6417215098265373, "grad_norm": 2.36362624168396, "learning_rate": 3.9304641502891045e-05, "loss": 0.6221, "step": 227000 }, { "epoch": 0.6431349933283579, "grad_norm": 1.8366997241973877, "learning_rate": 3.928108344452737e-05, "loss": 0.6173, "step": 227500 }, { "epoch": 0.6445484768301785, "grad_norm": 1.9854469299316406, "learning_rate": 3.925752538616369e-05, "loss": 0.6206, "step": 228000 }, { "epoch": 0.645961960331999, "grad_norm": 1.968384027481079, "learning_rate": 3.9233967327800016e-05, "loss": 0.6194, "step": 228500 }, { "epoch": 0.6473754438338196, "grad_norm": 1.6518436670303345, "learning_rate": 3.921040926943634e-05, "loss": 0.6181, "step": 229000 }, { "epoch": 0.6487889273356401, "grad_norm": 1.896748661994934, "learning_rate": 3.9186851211072664e-05, "loss": 0.6195, "step": 229500 }, { "epoch": 0.6502024108374607, "grad_norm": 2.000025510787964, "learning_rate": 3.9163293152708994e-05, "loss": 0.6223, "step": 230000 }, { "epoch": 0.6516158943392812, "grad_norm": 1.9804794788360596, "learning_rate": 3.913973509434532e-05, "loss": 0.6199, "step": 230500 }, { "epoch": 0.6530293778411018, "grad_norm": 1.7015196084976196, "learning_rate": 3.911617703598164e-05, "loss": 0.6175, "step": 231000 }, { "epoch": 0.6544428613429224, "grad_norm": 1.8146058320999146, "learning_rate": 3.9092618977617965e-05, "loss": 0.6204, "step": 231500 }, { "epoch": 0.655856344844743, "grad_norm": 1.8514864444732666, "learning_rate": 3.906906091925429e-05, "loss": 0.6188, "step": 232000 }, { "epoch": 0.6572698283465636, "grad_norm": 1.9639760255813599, "learning_rate": 3.904550286089061e-05, "loss": 0.6167, "step": 232500 }, { "epoch": 0.6586833118483841, "grad_norm": 1.7671340703964233, "learning_rate": 3.9021944802526936e-05, "loss": 0.6194, "step": 233000 }, { "epoch": 0.6600967953502047, "grad_norm": 2.0465102195739746, "learning_rate": 3.899838674416326e-05, "loss": 0.615, "step": 233500 }, { "epoch": 0.6615102788520253, "grad_norm": 1.892221450805664, "learning_rate": 3.8974828685799584e-05, "loss": 0.6184, "step": 234000 }, { "epoch": 0.6629237623538458, "grad_norm": 1.9812071323394775, "learning_rate": 3.895127062743591e-05, "loss": 0.6167, "step": 234500 }, { "epoch": 0.6643372458556663, "grad_norm": 1.7472712993621826, "learning_rate": 3.892771256907223e-05, "loss": 0.6161, "step": 235000 }, { "epoch": 0.6657507293574869, "grad_norm": 1.9572404623031616, "learning_rate": 3.8904154510708555e-05, "loss": 0.6146, "step": 235500 }, { "epoch": 0.6671642128593075, "grad_norm": 1.8057337999343872, "learning_rate": 3.888059645234488e-05, "loss": 0.6211, "step": 236000 }, { "epoch": 0.6685776963611281, "grad_norm": 1.9051380157470703, "learning_rate": 3.88570383939812e-05, "loss": 0.6174, "step": 236500 }, { "epoch": 0.6699911798629486, "grad_norm": 1.9259700775146484, "learning_rate": 3.8833480335617526e-05, "loss": 0.6175, "step": 237000 }, { "epoch": 0.6714046633647692, "grad_norm": 2.022965908050537, "learning_rate": 3.880992227725385e-05, "loss": 0.6168, "step": 237500 }, { "epoch": 0.6728181468665898, "grad_norm": 1.841408133506775, "learning_rate": 3.878636421889017e-05, "loss": 0.6176, "step": 238000 }, { "epoch": 0.6742316303684104, "grad_norm": 2.212230682373047, "learning_rate": 3.87628061605265e-05, "loss": 0.6135, "step": 238500 }, { "epoch": 0.6756451138702309, "grad_norm": 1.9169288873672485, "learning_rate": 3.873924810216282e-05, "loss": 0.613, "step": 239000 }, { "epoch": 0.6770585973720514, "grad_norm": 1.7519197463989258, "learning_rate": 3.8715690043799144e-05, "loss": 0.6149, "step": 239500 }, { "epoch": 0.678472080873872, "grad_norm": 1.8371704816818237, "learning_rate": 3.869213198543547e-05, "loss": 0.6151, "step": 240000 }, { "epoch": 0.6798855643756926, "grad_norm": 1.9791675806045532, "learning_rate": 3.866857392707179e-05, "loss": 0.617, "step": 240500 }, { "epoch": 0.6812990478775132, "grad_norm": 1.838545799255371, "learning_rate": 3.8645015868708115e-05, "loss": 0.608, "step": 241000 }, { "epoch": 0.6827125313793337, "grad_norm": 1.8278930187225342, "learning_rate": 3.862145781034444e-05, "loss": 0.6168, "step": 241500 }, { "epoch": 0.6841260148811543, "grad_norm": 2.0241081714630127, "learning_rate": 3.859789975198076e-05, "loss": 0.6131, "step": 242000 }, { "epoch": 0.6855394983829749, "grad_norm": 2.2139530181884766, "learning_rate": 3.8574341693617086e-05, "loss": 0.6099, "step": 242500 }, { "epoch": 0.6869529818847955, "grad_norm": 1.9893624782562256, "learning_rate": 3.855078363525341e-05, "loss": 0.6102, "step": 243000 }, { "epoch": 0.688366465386616, "grad_norm": 1.8282994031906128, "learning_rate": 3.852722557688973e-05, "loss": 0.6095, "step": 243500 }, { "epoch": 0.6897799488884365, "grad_norm": 1.843912959098816, "learning_rate": 3.850366751852606e-05, "loss": 0.6158, "step": 244000 }, { "epoch": 0.6911934323902571, "grad_norm": 1.9456268548965454, "learning_rate": 3.848010946016238e-05, "loss": 0.6129, "step": 244500 }, { "epoch": 0.6926069158920777, "grad_norm": 1.7876250743865967, "learning_rate": 3.8456551401798704e-05, "loss": 0.6101, "step": 245000 }, { "epoch": 0.6940203993938983, "grad_norm": 2.283369302749634, "learning_rate": 3.843299334343503e-05, "loss": 0.6149, "step": 245500 }, { "epoch": 0.6954338828957188, "grad_norm": 2.2584314346313477, "learning_rate": 3.840943528507135e-05, "loss": 0.6139, "step": 246000 }, { "epoch": 0.6968473663975394, "grad_norm": 1.8926889896392822, "learning_rate": 3.8385877226707675e-05, "loss": 0.616, "step": 246500 }, { "epoch": 0.69826084989936, "grad_norm": 1.7500066757202148, "learning_rate": 3.8362319168344e-05, "loss": 0.6123, "step": 247000 }, { "epoch": 0.6996743334011806, "grad_norm": 1.9299042224884033, "learning_rate": 3.833876110998032e-05, "loss": 0.6134, "step": 247500 }, { "epoch": 0.7010878169030011, "grad_norm": 1.990527868270874, "learning_rate": 3.8315203051616646e-05, "loss": 0.6085, "step": 248000 }, { "epoch": 0.7025013004048217, "grad_norm": 2.1570639610290527, "learning_rate": 3.829164499325297e-05, "loss": 0.6111, "step": 248500 }, { "epoch": 0.7039147839066422, "grad_norm": 1.9779850244522095, "learning_rate": 3.8268086934889294e-05, "loss": 0.6105, "step": 249000 }, { "epoch": 0.7053282674084628, "grad_norm": 1.8519608974456787, "learning_rate": 3.824452887652562e-05, "loss": 0.613, "step": 249500 }, { "epoch": 0.7067417509102834, "grad_norm": 2.0072107315063477, "learning_rate": 3.822097081816194e-05, "loss": 0.6126, "step": 250000 }, { "epoch": 0.7081552344121039, "grad_norm": 1.892958641052246, "learning_rate": 3.8197412759798265e-05, "loss": 0.612, "step": 250500 }, { "epoch": 0.7095687179139245, "grad_norm": 2.0438249111175537, "learning_rate": 3.817385470143459e-05, "loss": 0.6058, "step": 251000 }, { "epoch": 0.7109822014157451, "grad_norm": 1.950453281402588, "learning_rate": 3.815029664307091e-05, "loss": 0.6106, "step": 251500 }, { "epoch": 0.7123956849175657, "grad_norm": 1.972099781036377, "learning_rate": 3.8126738584707236e-05, "loss": 0.609, "step": 252000 }, { "epoch": 0.7138091684193862, "grad_norm": 1.929118275642395, "learning_rate": 3.810318052634356e-05, "loss": 0.61, "step": 252500 }, { "epoch": 0.7152226519212068, "grad_norm": 1.9493205547332764, "learning_rate": 3.807962246797988e-05, "loss": 0.6124, "step": 253000 }, { "epoch": 0.7166361354230273, "grad_norm": 1.6826261281967163, "learning_rate": 3.8056064409616214e-05, "loss": 0.6067, "step": 253500 }, { "epoch": 0.7180496189248479, "grad_norm": 1.9503551721572876, "learning_rate": 3.803250635125254e-05, "loss": 0.6142, "step": 254000 }, { "epoch": 0.7194631024266684, "grad_norm": 2.0457382202148438, "learning_rate": 3.800894829288886e-05, "loss": 0.6094, "step": 254500 }, { "epoch": 0.720876585928489, "grad_norm": 1.8640727996826172, "learning_rate": 3.7985390234525185e-05, "loss": 0.6105, "step": 255000 }, { "epoch": 0.7222900694303096, "grad_norm": 1.7050703763961792, "learning_rate": 3.796183217616151e-05, "loss": 0.6125, "step": 255500 }, { "epoch": 0.7237035529321302, "grad_norm": 2.1210110187530518, "learning_rate": 3.793827411779783e-05, "loss": 0.6083, "step": 256000 }, { "epoch": 0.7251170364339508, "grad_norm": 1.8713070154190063, "learning_rate": 3.7914716059434156e-05, "loss": 0.6048, "step": 256500 }, { "epoch": 0.7265305199357713, "grad_norm": 1.875571608543396, "learning_rate": 3.789115800107048e-05, "loss": 0.6066, "step": 257000 }, { "epoch": 0.7279440034375919, "grad_norm": 1.8263658285140991, "learning_rate": 3.78675999427068e-05, "loss": 0.6083, "step": 257500 }, { "epoch": 0.7293574869394125, "grad_norm": 1.8539094924926758, "learning_rate": 3.784404188434313e-05, "loss": 0.6082, "step": 258000 }, { "epoch": 0.730770970441233, "grad_norm": 1.757389783859253, "learning_rate": 3.782048382597945e-05, "loss": 0.6097, "step": 258500 }, { "epoch": 0.7321844539430535, "grad_norm": 2.060903787612915, "learning_rate": 3.7796925767615774e-05, "loss": 0.6031, "step": 259000 }, { "epoch": 0.7335979374448741, "grad_norm": 1.7717828750610352, "learning_rate": 3.77733677092521e-05, "loss": 0.6095, "step": 259500 }, { "epoch": 0.7350114209466947, "grad_norm": 1.9142152070999146, "learning_rate": 3.774980965088842e-05, "loss": 0.6104, "step": 260000 }, { "epoch": 0.7364249044485153, "grad_norm": 1.8692725896835327, "learning_rate": 3.7726251592524745e-05, "loss": 0.6055, "step": 260500 }, { "epoch": 0.7378383879503359, "grad_norm": 1.8337864875793457, "learning_rate": 3.770269353416107e-05, "loss": 0.6099, "step": 261000 }, { "epoch": 0.7392518714521564, "grad_norm": 1.7165493965148926, "learning_rate": 3.767913547579739e-05, "loss": 0.6019, "step": 261500 }, { "epoch": 0.740665354953977, "grad_norm": 1.8660248517990112, "learning_rate": 3.7655577417433716e-05, "loss": 0.6116, "step": 262000 }, { "epoch": 0.7420788384557976, "grad_norm": 1.9784579277038574, "learning_rate": 3.763201935907004e-05, "loss": 0.6053, "step": 262500 }, { "epoch": 0.7434923219576182, "grad_norm": 1.8330358266830444, "learning_rate": 3.7608461300706363e-05, "loss": 0.6015, "step": 263000 }, { "epoch": 0.7449058054594386, "grad_norm": 2.1759350299835205, "learning_rate": 3.758490324234269e-05, "loss": 0.6066, "step": 263500 }, { "epoch": 0.7463192889612592, "grad_norm": 1.8769718408584595, "learning_rate": 3.756134518397901e-05, "loss": 0.6058, "step": 264000 }, { "epoch": 0.7477327724630798, "grad_norm": 1.7798304557800293, "learning_rate": 3.7537787125615334e-05, "loss": 0.6082, "step": 264500 }, { "epoch": 0.7491462559649004, "grad_norm": 1.895510196685791, "learning_rate": 3.751422906725166e-05, "loss": 0.6028, "step": 265000 }, { "epoch": 0.7505597394667209, "grad_norm": 1.9621906280517578, "learning_rate": 3.749067100888798e-05, "loss": 0.6059, "step": 265500 }, { "epoch": 0.7519732229685415, "grad_norm": 1.8031742572784424, "learning_rate": 3.7467112950524305e-05, "loss": 0.6017, "step": 266000 }, { "epoch": 0.7533867064703621, "grad_norm": 1.832423210144043, "learning_rate": 3.744355489216063e-05, "loss": 0.6062, "step": 266500 }, { "epoch": 0.7548001899721827, "grad_norm": 1.969154715538025, "learning_rate": 3.741999683379695e-05, "loss": 0.604, "step": 267000 }, { "epoch": 0.7562136734740033, "grad_norm": 1.9565626382827759, "learning_rate": 3.7396438775433277e-05, "loss": 0.6032, "step": 267500 }, { "epoch": 0.7576271569758237, "grad_norm": 1.9368705749511719, "learning_rate": 3.737288071706961e-05, "loss": 0.6042, "step": 268000 }, { "epoch": 0.7590406404776443, "grad_norm": 1.9383379220962524, "learning_rate": 3.734932265870593e-05, "loss": 0.6011, "step": 268500 }, { "epoch": 0.7604541239794649, "grad_norm": 1.9622483253479004, "learning_rate": 3.7325764600342254e-05, "loss": 0.605, "step": 269000 }, { "epoch": 0.7618676074812855, "grad_norm": 1.7892271280288696, "learning_rate": 3.730220654197858e-05, "loss": 0.5993, "step": 269500 }, { "epoch": 0.763281090983106, "grad_norm": 2.6887857913970947, "learning_rate": 3.72786484836149e-05, "loss": 0.6029, "step": 270000 }, { "epoch": 0.7646945744849266, "grad_norm": 1.812602162361145, "learning_rate": 3.7255090425251225e-05, "loss": 0.5993, "step": 270500 }, { "epoch": 0.7661080579867472, "grad_norm": 1.8653610944747925, "learning_rate": 3.723153236688755e-05, "loss": 0.6026, "step": 271000 }, { "epoch": 0.7675215414885678, "grad_norm": 1.954752802848816, "learning_rate": 3.720797430852387e-05, "loss": 0.6056, "step": 271500 }, { "epoch": 0.7689350249903883, "grad_norm": 1.7160903215408325, "learning_rate": 3.7184416250160196e-05, "loss": 0.6071, "step": 272000 }, { "epoch": 0.7703485084922089, "grad_norm": 1.7992541790008545, "learning_rate": 3.716085819179652e-05, "loss": 0.5977, "step": 272500 }, { "epoch": 0.7717619919940294, "grad_norm": 1.8806262016296387, "learning_rate": 3.7137300133432844e-05, "loss": 0.6022, "step": 273000 }, { "epoch": 0.77317547549585, "grad_norm": 1.9988188743591309, "learning_rate": 3.711374207506917e-05, "loss": 0.6058, "step": 273500 }, { "epoch": 0.7745889589976706, "grad_norm": 1.9410970211029053, "learning_rate": 3.709018401670549e-05, "loss": 0.5981, "step": 274000 }, { "epoch": 0.7760024424994911, "grad_norm": 1.895379900932312, "learning_rate": 3.7066625958341815e-05, "loss": 0.6019, "step": 274500 }, { "epoch": 0.7774159260013117, "grad_norm": 1.903684377670288, "learning_rate": 3.704306789997814e-05, "loss": 0.6019, "step": 275000 }, { "epoch": 0.7788294095031323, "grad_norm": 1.7529374361038208, "learning_rate": 3.701950984161446e-05, "loss": 0.6011, "step": 275500 }, { "epoch": 0.7802428930049529, "grad_norm": 1.8871148824691772, "learning_rate": 3.6995951783250786e-05, "loss": 0.6019, "step": 276000 }, { "epoch": 0.7816563765067734, "grad_norm": 2.0056920051574707, "learning_rate": 3.697239372488711e-05, "loss": 0.5966, "step": 276500 }, { "epoch": 0.783069860008594, "grad_norm": 1.979400396347046, "learning_rate": 3.694883566652343e-05, "loss": 0.6007, "step": 277000 }, { "epoch": 0.7844833435104146, "grad_norm": 1.8415111303329468, "learning_rate": 3.692527760815976e-05, "loss": 0.6008, "step": 277500 }, { "epoch": 0.7858968270122351, "grad_norm": 2.060133457183838, "learning_rate": 3.690171954979608e-05, "loss": 0.5978, "step": 278000 }, { "epoch": 0.7873103105140556, "grad_norm": 2.2376132011413574, "learning_rate": 3.6878161491432404e-05, "loss": 0.5995, "step": 278500 }, { "epoch": 0.7887237940158762, "grad_norm": 1.9143754243850708, "learning_rate": 3.685460343306873e-05, "loss": 0.5988, "step": 279000 }, { "epoch": 0.7901372775176968, "grad_norm": 1.792958378791809, "learning_rate": 3.683104537470505e-05, "loss": 0.6007, "step": 279500 }, { "epoch": 0.7915507610195174, "grad_norm": 1.9053369760513306, "learning_rate": 3.6807487316341375e-05, "loss": 0.6038, "step": 280000 }, { "epoch": 0.792964244521338, "grad_norm": 1.7971118688583374, "learning_rate": 3.67839292579777e-05, "loss": 0.5947, "step": 280500 }, { "epoch": 0.7943777280231585, "grad_norm": 1.791193962097168, "learning_rate": 3.676037119961402e-05, "loss": 0.6037, "step": 281000 }, { "epoch": 0.7957912115249791, "grad_norm": 1.9802024364471436, "learning_rate": 3.6736813141250346e-05, "loss": 0.5978, "step": 281500 }, { "epoch": 0.7972046950267997, "grad_norm": 1.7685120105743408, "learning_rate": 3.671325508288667e-05, "loss": 0.5978, "step": 282000 }, { "epoch": 0.7986181785286202, "grad_norm": 1.8075288534164429, "learning_rate": 3.6689697024523e-05, "loss": 0.5982, "step": 282500 }, { "epoch": 0.8000316620304407, "grad_norm": 1.8952981233596802, "learning_rate": 3.6666138966159324e-05, "loss": 0.5979, "step": 283000 }, { "epoch": 0.8014451455322613, "grad_norm": 1.7745672464370728, "learning_rate": 3.664258090779565e-05, "loss": 0.6008, "step": 283500 }, { "epoch": 0.8028586290340819, "grad_norm": 1.9894499778747559, "learning_rate": 3.661902284943197e-05, "loss": 0.6035, "step": 284000 }, { "epoch": 0.8042721125359025, "grad_norm": 1.95890212059021, "learning_rate": 3.6595464791068295e-05, "loss": 0.6033, "step": 284500 }, { "epoch": 0.805685596037723, "grad_norm": 1.8268929719924927, "learning_rate": 3.657190673270462e-05, "loss": 0.5962, "step": 285000 }, { "epoch": 0.8070990795395436, "grad_norm": 1.889316439628601, "learning_rate": 3.654834867434094e-05, "loss": 0.6002, "step": 285500 }, { "epoch": 0.8085125630413642, "grad_norm": 1.7330410480499268, "learning_rate": 3.6524790615977266e-05, "loss": 0.5979, "step": 286000 }, { "epoch": 0.8099260465431848, "grad_norm": 1.8749533891677856, "learning_rate": 3.650123255761359e-05, "loss": 0.5959, "step": 286500 }, { "epoch": 0.8113395300450054, "grad_norm": 1.7867050170898438, "learning_rate": 3.647767449924991e-05, "loss": 0.5984, "step": 287000 }, { "epoch": 0.8127530135468258, "grad_norm": 2.014152765274048, "learning_rate": 3.645411644088624e-05, "loss": 0.5994, "step": 287500 }, { "epoch": 0.8141664970486464, "grad_norm": 1.953102469444275, "learning_rate": 3.643055838252256e-05, "loss": 0.5971, "step": 288000 }, { "epoch": 0.815579980550467, "grad_norm": 1.6864335536956787, "learning_rate": 3.6407000324158884e-05, "loss": 0.5953, "step": 288500 }, { "epoch": 0.8169934640522876, "grad_norm": 2.0586600303649902, "learning_rate": 3.638344226579521e-05, "loss": 0.5961, "step": 289000 }, { "epoch": 0.8184069475541081, "grad_norm": 1.9390426874160767, "learning_rate": 3.635988420743153e-05, "loss": 0.5975, "step": 289500 }, { "epoch": 0.8198204310559287, "grad_norm": 1.9605486392974854, "learning_rate": 3.6336326149067855e-05, "loss": 0.5969, "step": 290000 }, { "epoch": 0.8212339145577493, "grad_norm": 1.872693657875061, "learning_rate": 3.631276809070418e-05, "loss": 0.5988, "step": 290500 }, { "epoch": 0.8226473980595699, "grad_norm": 1.9217848777770996, "learning_rate": 3.62892100323405e-05, "loss": 0.594, "step": 291000 }, { "epoch": 0.8240608815613905, "grad_norm": 1.874862551689148, "learning_rate": 3.6265651973976826e-05, "loss": 0.5939, "step": 291500 }, { "epoch": 0.825474365063211, "grad_norm": 1.731613278388977, "learning_rate": 3.624209391561315e-05, "loss": 0.5984, "step": 292000 }, { "epoch": 0.8268878485650315, "grad_norm": 1.8584221601486206, "learning_rate": 3.6218535857249474e-05, "loss": 0.5931, "step": 292500 }, { "epoch": 0.8283013320668521, "grad_norm": 1.9315375089645386, "learning_rate": 3.61949777988858e-05, "loss": 0.5982, "step": 293000 }, { "epoch": 0.8297148155686727, "grad_norm": 2.0657551288604736, "learning_rate": 3.617141974052212e-05, "loss": 0.5951, "step": 293500 }, { "epoch": 0.8311282990704932, "grad_norm": 1.811474084854126, "learning_rate": 3.6147861682158445e-05, "loss": 0.6012, "step": 294000 }, { "epoch": 0.8325417825723138, "grad_norm": 1.8847694396972656, "learning_rate": 3.612430362379477e-05, "loss": 0.5882, "step": 294500 }, { "epoch": 0.8339552660741344, "grad_norm": 1.9133442640304565, "learning_rate": 3.610074556543109e-05, "loss": 0.5983, "step": 295000 }, { "epoch": 0.835368749575955, "grad_norm": 1.8558775186538696, "learning_rate": 3.6077187507067416e-05, "loss": 0.5921, "step": 295500 }, { "epoch": 0.8367822330777756, "grad_norm": 1.9489132165908813, "learning_rate": 3.605362944870374e-05, "loss": 0.5937, "step": 296000 }, { "epoch": 0.8381957165795961, "grad_norm": 1.9695388078689575, "learning_rate": 3.603007139034006e-05, "loss": 0.5967, "step": 296500 }, { "epoch": 0.8396092000814166, "grad_norm": 1.9106175899505615, "learning_rate": 3.6006513331976394e-05, "loss": 0.5958, "step": 297000 }, { "epoch": 0.8410226835832372, "grad_norm": 1.8081308603286743, "learning_rate": 3.598295527361272e-05, "loss": 0.593, "step": 297500 }, { "epoch": 0.8424361670850578, "grad_norm": 1.8449726104736328, "learning_rate": 3.595939721524904e-05, "loss": 0.5935, "step": 298000 }, { "epoch": 0.8438496505868783, "grad_norm": 1.833025574684143, "learning_rate": 3.5935839156885365e-05, "loss": 0.5904, "step": 298500 }, { "epoch": 0.8452631340886989, "grad_norm": 2.01021146774292, "learning_rate": 3.591228109852169e-05, "loss": 0.5932, "step": 299000 }, { "epoch": 0.8466766175905195, "grad_norm": 1.7380565404891968, "learning_rate": 3.588872304015801e-05, "loss": 0.5903, "step": 299500 }, { "epoch": 0.8480901010923401, "grad_norm": 2.0876429080963135, "learning_rate": 3.5865164981794336e-05, "loss": 0.5938, "step": 300000 }, { "epoch": 0.8495035845941606, "grad_norm": 1.8812239170074463, "learning_rate": 3.584160692343066e-05, "loss": 0.5893, "step": 300500 }, { "epoch": 0.8509170680959812, "grad_norm": 1.7818992137908936, "learning_rate": 3.581804886506698e-05, "loss": 0.5924, "step": 301000 }, { "epoch": 0.8523305515978018, "grad_norm": 2.005600690841675, "learning_rate": 3.579449080670331e-05, "loss": 0.5891, "step": 301500 }, { "epoch": 0.8537440350996223, "grad_norm": 1.8553566932678223, "learning_rate": 3.577093274833963e-05, "loss": 0.5896, "step": 302000 }, { "epoch": 0.8551575186014428, "grad_norm": 1.9020817279815674, "learning_rate": 3.5747374689975954e-05, "loss": 0.5928, "step": 302500 }, { "epoch": 0.8565710021032634, "grad_norm": 2.0885257720947266, "learning_rate": 3.572381663161228e-05, "loss": 0.5902, "step": 303000 }, { "epoch": 0.857984485605084, "grad_norm": 1.6824073791503906, "learning_rate": 3.57002585732486e-05, "loss": 0.5917, "step": 303500 }, { "epoch": 0.8593979691069046, "grad_norm": 1.8991578817367554, "learning_rate": 3.5676700514884925e-05, "loss": 0.5903, "step": 304000 }, { "epoch": 0.8608114526087252, "grad_norm": 1.8822497129440308, "learning_rate": 3.565314245652125e-05, "loss": 0.5977, "step": 304500 }, { "epoch": 0.8622249361105457, "grad_norm": 2.0223093032836914, "learning_rate": 3.562958439815757e-05, "loss": 0.5868, "step": 305000 }, { "epoch": 0.8636384196123663, "grad_norm": 1.8350187540054321, "learning_rate": 3.5606026339793896e-05, "loss": 0.5937, "step": 305500 }, { "epoch": 0.8650519031141869, "grad_norm": 1.9229165315628052, "learning_rate": 3.558246828143022e-05, "loss": 0.5887, "step": 306000 }, { "epoch": 0.8664653866160075, "grad_norm": 1.9054909944534302, "learning_rate": 3.5558910223066543e-05, "loss": 0.5889, "step": 306500 }, { "epoch": 0.8678788701178279, "grad_norm": 1.9834719896316528, "learning_rate": 3.553535216470287e-05, "loss": 0.5891, "step": 307000 }, { "epoch": 0.8692923536196485, "grad_norm": 1.9734822511672974, "learning_rate": 3.551179410633919e-05, "loss": 0.5917, "step": 307500 }, { "epoch": 0.8707058371214691, "grad_norm": 2.0296788215637207, "learning_rate": 3.5488236047975514e-05, "loss": 0.5939, "step": 308000 }, { "epoch": 0.8721193206232897, "grad_norm": 1.9160022735595703, "learning_rate": 3.546467798961184e-05, "loss": 0.5915, "step": 308500 }, { "epoch": 0.8735328041251103, "grad_norm": 1.8095922470092773, "learning_rate": 3.544111993124816e-05, "loss": 0.5906, "step": 309000 }, { "epoch": 0.8749462876269308, "grad_norm": 2.1554205417633057, "learning_rate": 3.5417561872884485e-05, "loss": 0.5887, "step": 309500 }, { "epoch": 0.8763597711287514, "grad_norm": 1.8364665508270264, "learning_rate": 3.539400381452081e-05, "loss": 0.5883, "step": 310000 }, { "epoch": 0.877773254630572, "grad_norm": 1.9245195388793945, "learning_rate": 3.537044575615713e-05, "loss": 0.5881, "step": 310500 }, { "epoch": 0.8791867381323926, "grad_norm": 1.8894625902175903, "learning_rate": 3.5346887697793456e-05, "loss": 0.5892, "step": 311000 }, { "epoch": 0.880600221634213, "grad_norm": 1.9776558876037598, "learning_rate": 3.532332963942979e-05, "loss": 0.5867, "step": 311500 }, { "epoch": 0.8820137051360336, "grad_norm": 1.949259638786316, "learning_rate": 3.529977158106611e-05, "loss": 0.5868, "step": 312000 }, { "epoch": 0.8834271886378542, "grad_norm": 1.798855185508728, "learning_rate": 3.5276213522702434e-05, "loss": 0.5907, "step": 312500 }, { "epoch": 0.8848406721396748, "grad_norm": 1.8175495862960815, "learning_rate": 3.525265546433876e-05, "loss": 0.5857, "step": 313000 }, { "epoch": 0.8862541556414953, "grad_norm": 1.978953242301941, "learning_rate": 3.522909740597508e-05, "loss": 0.5843, "step": 313500 }, { "epoch": 0.8876676391433159, "grad_norm": 1.7025822401046753, "learning_rate": 3.5205539347611405e-05, "loss": 0.5875, "step": 314000 }, { "epoch": 0.8890811226451365, "grad_norm": 1.9008023738861084, "learning_rate": 3.518198128924773e-05, "loss": 0.5904, "step": 314500 }, { "epoch": 0.8904946061469571, "grad_norm": 1.95393967628479, "learning_rate": 3.515842323088405e-05, "loss": 0.5871, "step": 315000 }, { "epoch": 0.8919080896487777, "grad_norm": 1.7087043523788452, "learning_rate": 3.5134865172520376e-05, "loss": 0.59, "step": 315500 }, { "epoch": 0.8933215731505982, "grad_norm": 1.9665342569351196, "learning_rate": 3.51113071141567e-05, "loss": 0.5939, "step": 316000 }, { "epoch": 0.8947350566524187, "grad_norm": 1.8828564882278442, "learning_rate": 3.5087749055793024e-05, "loss": 0.5867, "step": 316500 }, { "epoch": 0.8961485401542393, "grad_norm": 1.8067898750305176, "learning_rate": 3.506419099742935e-05, "loss": 0.5861, "step": 317000 }, { "epoch": 0.8975620236560599, "grad_norm": 1.7631852626800537, "learning_rate": 3.504063293906567e-05, "loss": 0.5894, "step": 317500 }, { "epoch": 0.8989755071578804, "grad_norm": 1.799294114112854, "learning_rate": 3.5017074880701995e-05, "loss": 0.5831, "step": 318000 }, { "epoch": 0.900388990659701, "grad_norm": 1.8449277877807617, "learning_rate": 3.499351682233832e-05, "loss": 0.5836, "step": 318500 }, { "epoch": 0.9018024741615216, "grad_norm": 1.949988603591919, "learning_rate": 3.496995876397464e-05, "loss": 0.5902, "step": 319000 }, { "epoch": 0.9032159576633422, "grad_norm": 1.93195641040802, "learning_rate": 3.4946400705610966e-05, "loss": 0.5876, "step": 319500 }, { "epoch": 0.9046294411651628, "grad_norm": 1.991378903388977, "learning_rate": 3.492284264724729e-05, "loss": 0.5828, "step": 320000 }, { "epoch": 0.9060429246669833, "grad_norm": 1.9340568780899048, "learning_rate": 3.489928458888361e-05, "loss": 0.587, "step": 320500 }, { "epoch": 0.9074564081688039, "grad_norm": 1.840930461883545, "learning_rate": 3.487572653051994e-05, "loss": 0.5871, "step": 321000 }, { "epoch": 0.9088698916706244, "grad_norm": 1.7890210151672363, "learning_rate": 3.485216847215626e-05, "loss": 0.5876, "step": 321500 }, { "epoch": 0.910283375172445, "grad_norm": 1.962910771369934, "learning_rate": 3.4828610413792584e-05, "loss": 0.5865, "step": 322000 }, { "epoch": 0.9116968586742655, "grad_norm": 2.023253917694092, "learning_rate": 3.480505235542891e-05, "loss": 0.5913, "step": 322500 }, { "epoch": 0.9131103421760861, "grad_norm": 1.835400938987732, "learning_rate": 3.478149429706523e-05, "loss": 0.5859, "step": 323000 }, { "epoch": 0.9145238256779067, "grad_norm": 2.0387985706329346, "learning_rate": 3.4757936238701555e-05, "loss": 0.5838, "step": 323500 }, { "epoch": 0.9159373091797273, "grad_norm": 1.8406462669372559, "learning_rate": 3.473437818033788e-05, "loss": 0.5866, "step": 324000 }, { "epoch": 0.9173507926815478, "grad_norm": 1.8033616542816162, "learning_rate": 3.47108201219742e-05, "loss": 0.5839, "step": 324500 }, { "epoch": 0.9187642761833684, "grad_norm": 2.035203456878662, "learning_rate": 3.4687262063610526e-05, "loss": 0.5903, "step": 325000 }, { "epoch": 0.920177759685189, "grad_norm": 2.0339694023132324, "learning_rate": 3.466370400524685e-05, "loss": 0.5828, "step": 325500 }, { "epoch": 0.9215912431870095, "grad_norm": 1.9567062854766846, "learning_rate": 3.464014594688318e-05, "loss": 0.5866, "step": 326000 }, { "epoch": 0.92300472668883, "grad_norm": 1.9255435466766357, "learning_rate": 3.4616587888519504e-05, "loss": 0.581, "step": 326500 }, { "epoch": 0.9244182101906506, "grad_norm": 1.764859676361084, "learning_rate": 3.459302983015583e-05, "loss": 0.5852, "step": 327000 }, { "epoch": 0.9258316936924712, "grad_norm": 2.0059189796447754, "learning_rate": 3.456947177179215e-05, "loss": 0.5813, "step": 327500 }, { "epoch": 0.9272451771942918, "grad_norm": 2.036370038986206, "learning_rate": 3.4545913713428475e-05, "loss": 0.5822, "step": 328000 }, { "epoch": 0.9286586606961124, "grad_norm": 1.8872077465057373, "learning_rate": 3.45223556550648e-05, "loss": 0.5853, "step": 328500 }, { "epoch": 0.9300721441979329, "grad_norm": 1.8803049325942993, "learning_rate": 3.449879759670112e-05, "loss": 0.5848, "step": 329000 }, { "epoch": 0.9314856276997535, "grad_norm": 1.7240315675735474, "learning_rate": 3.4475239538337446e-05, "loss": 0.579, "step": 329500 }, { "epoch": 0.9328991112015741, "grad_norm": 1.8084460496902466, "learning_rate": 3.445168147997377e-05, "loss": 0.5858, "step": 330000 }, { "epoch": 0.9343125947033947, "grad_norm": 1.8493149280548096, "learning_rate": 3.442812342161009e-05, "loss": 0.5812, "step": 330500 }, { "epoch": 0.9357260782052151, "grad_norm": 1.8838059902191162, "learning_rate": 3.440456536324642e-05, "loss": 0.5882, "step": 331000 }, { "epoch": 0.9371395617070357, "grad_norm": 1.8747752904891968, "learning_rate": 3.438100730488274e-05, "loss": 0.5859, "step": 331500 }, { "epoch": 0.9385530452088563, "grad_norm": 1.8552840948104858, "learning_rate": 3.4357449246519064e-05, "loss": 0.5847, "step": 332000 }, { "epoch": 0.9399665287106769, "grad_norm": 1.927878975868225, "learning_rate": 3.433389118815539e-05, "loss": 0.5826, "step": 332500 }, { "epoch": 0.9413800122124975, "grad_norm": 1.8824865818023682, "learning_rate": 3.431033312979171e-05, "loss": 0.5851, "step": 333000 }, { "epoch": 0.942793495714318, "grad_norm": 1.7936691045761108, "learning_rate": 3.4286775071428035e-05, "loss": 0.5844, "step": 333500 }, { "epoch": 0.9442069792161386, "grad_norm": 1.711824893951416, "learning_rate": 3.426321701306436e-05, "loss": 0.5826, "step": 334000 }, { "epoch": 0.9456204627179592, "grad_norm": 1.8184577226638794, "learning_rate": 3.423965895470068e-05, "loss": 0.584, "step": 334500 }, { "epoch": 0.9470339462197798, "grad_norm": 1.7772451639175415, "learning_rate": 3.4216100896337006e-05, "loss": 0.5872, "step": 335000 }, { "epoch": 0.9484474297216003, "grad_norm": 1.7819527387619019, "learning_rate": 3.419254283797333e-05, "loss": 0.5811, "step": 335500 }, { "epoch": 0.9498609132234208, "grad_norm": 1.8568309545516968, "learning_rate": 3.4168984779609654e-05, "loss": 0.5829, "step": 336000 }, { "epoch": 0.9512743967252414, "grad_norm": 1.9460350275039673, "learning_rate": 3.414542672124598e-05, "loss": 0.583, "step": 336500 }, { "epoch": 0.952687880227062, "grad_norm": 1.8824281692504883, "learning_rate": 3.41218686628823e-05, "loss": 0.58, "step": 337000 }, { "epoch": 0.9541013637288825, "grad_norm": 1.9121915102005005, "learning_rate": 3.4098310604518625e-05, "loss": 0.584, "step": 337500 }, { "epoch": 0.9555148472307031, "grad_norm": 2.055818557739258, "learning_rate": 3.407475254615495e-05, "loss": 0.5827, "step": 338000 }, { "epoch": 0.9569283307325237, "grad_norm": 1.8675333261489868, "learning_rate": 3.405119448779127e-05, "loss": 0.5805, "step": 338500 }, { "epoch": 0.9583418142343443, "grad_norm": 1.9681202173233032, "learning_rate": 3.4027636429427596e-05, "loss": 0.5836, "step": 339000 }, { "epoch": 0.9597552977361649, "grad_norm": 1.9255595207214355, "learning_rate": 3.400407837106392e-05, "loss": 0.5769, "step": 339500 }, { "epoch": 0.9611687812379854, "grad_norm": 1.9721808433532715, "learning_rate": 3.398052031270024e-05, "loss": 0.5843, "step": 340000 }, { "epoch": 0.9625822647398059, "grad_norm": 1.7663261890411377, "learning_rate": 3.3956962254336574e-05, "loss": 0.5788, "step": 340500 }, { "epoch": 0.9639957482416265, "grad_norm": 1.7629916667938232, "learning_rate": 3.39334041959729e-05, "loss": 0.5797, "step": 341000 }, { "epoch": 0.9654092317434471, "grad_norm": 1.8277138471603394, "learning_rate": 3.390984613760922e-05, "loss": 0.5798, "step": 341500 }, { "epoch": 0.9668227152452676, "grad_norm": 1.8738274574279785, "learning_rate": 3.3886288079245545e-05, "loss": 0.5793, "step": 342000 }, { "epoch": 0.9682361987470882, "grad_norm": 1.7419294118881226, "learning_rate": 3.386273002088187e-05, "loss": 0.5834, "step": 342500 }, { "epoch": 0.9696496822489088, "grad_norm": 1.8020451068878174, "learning_rate": 3.383917196251819e-05, "loss": 0.5797, "step": 343000 }, { "epoch": 0.9710631657507294, "grad_norm": 1.837862253189087, "learning_rate": 3.3815613904154516e-05, "loss": 0.5759, "step": 343500 }, { "epoch": 0.97247664925255, "grad_norm": 1.903087854385376, "learning_rate": 3.379205584579084e-05, "loss": 0.5802, "step": 344000 }, { "epoch": 0.9738901327543705, "grad_norm": 1.927441954612732, "learning_rate": 3.376849778742716e-05, "loss": 0.5789, "step": 344500 }, { "epoch": 0.9753036162561911, "grad_norm": 1.8367253541946411, "learning_rate": 3.374493972906349e-05, "loss": 0.5831, "step": 345000 }, { "epoch": 0.9767170997580116, "grad_norm": 1.9271984100341797, "learning_rate": 3.372138167069981e-05, "loss": 0.5773, "step": 345500 }, { "epoch": 0.9781305832598322, "grad_norm": 1.8781111240386963, "learning_rate": 3.3697823612336134e-05, "loss": 0.5788, "step": 346000 }, { "epoch": 0.9795440667616527, "grad_norm": 1.8772222995758057, "learning_rate": 3.367426555397246e-05, "loss": 0.5831, "step": 346500 }, { "epoch": 0.9809575502634733, "grad_norm": 1.94069504737854, "learning_rate": 3.365070749560878e-05, "loss": 0.5795, "step": 347000 }, { "epoch": 0.9823710337652939, "grad_norm": 1.982595682144165, "learning_rate": 3.3627149437245105e-05, "loss": 0.5817, "step": 347500 }, { "epoch": 0.9837845172671145, "grad_norm": 2.076514959335327, "learning_rate": 3.360359137888143e-05, "loss": 0.5778, "step": 348000 }, { "epoch": 0.985198000768935, "grad_norm": 1.8249603509902954, "learning_rate": 3.358003332051775e-05, "loss": 0.5802, "step": 348500 }, { "epoch": 0.9866114842707556, "grad_norm": 1.7937895059585571, "learning_rate": 3.3556475262154076e-05, "loss": 0.5815, "step": 349000 }, { "epoch": 0.9880249677725762, "grad_norm": 1.7945635318756104, "learning_rate": 3.35329172037904e-05, "loss": 0.5782, "step": 349500 }, { "epoch": 0.9894384512743968, "grad_norm": 1.9148646593093872, "learning_rate": 3.3509359145426723e-05, "loss": 0.5766, "step": 350000 }, { "epoch": 0.9908519347762172, "grad_norm": 1.8325837850570679, "learning_rate": 3.348580108706305e-05, "loss": 0.5798, "step": 350500 }, { "epoch": 0.9922654182780378, "grad_norm": 1.9027276039123535, "learning_rate": 3.346224302869937e-05, "loss": 0.58, "step": 351000 }, { "epoch": 0.9936789017798584, "grad_norm": 2.0603599548339844, "learning_rate": 3.3438684970335694e-05, "loss": 0.5785, "step": 351500 }, { "epoch": 0.995092385281679, "grad_norm": 2.010258913040161, "learning_rate": 3.341512691197202e-05, "loss": 0.5769, "step": 352000 }, { "epoch": 0.9965058687834996, "grad_norm": 1.7668412923812866, "learning_rate": 3.339156885360834e-05, "loss": 0.5775, "step": 352500 }, { "epoch": 0.9979193522853201, "grad_norm": 1.9002536535263062, "learning_rate": 3.3368010795244665e-05, "loss": 0.578, "step": 353000 }, { "epoch": 0.9993328357871407, "grad_norm": 1.7180805206298828, "learning_rate": 3.334445273688099e-05, "loss": 0.5762, "step": 353500 }, { "epoch": 1.0007463192889612, "grad_norm": 1.7328078746795654, "learning_rate": 3.332089467851731e-05, "loss": 0.5759, "step": 354000 }, { "epoch": 1.0021598027907819, "grad_norm": 1.9177926778793335, "learning_rate": 3.3297336620153636e-05, "loss": 0.5733, "step": 354500 }, { "epoch": 1.0035732862926023, "grad_norm": 1.9339911937713623, "learning_rate": 3.327377856178997e-05, "loss": 0.5774, "step": 355000 }, { "epoch": 1.004986769794423, "grad_norm": 1.8924944400787354, "learning_rate": 3.325022050342629e-05, "loss": 0.577, "step": 355500 }, { "epoch": 1.0064002532962435, "grad_norm": 1.880789875984192, "learning_rate": 3.3226662445062614e-05, "loss": 0.5784, "step": 356000 }, { "epoch": 1.0078137367980642, "grad_norm": 1.8468314409255981, "learning_rate": 3.320310438669894e-05, "loss": 0.572, "step": 356500 }, { "epoch": 1.0092272202998847, "grad_norm": 1.9924639463424683, "learning_rate": 3.317954632833526e-05, "loss": 0.5754, "step": 357000 }, { "epoch": 1.0106407038017051, "grad_norm": 2.020561933517456, "learning_rate": 3.3155988269971585e-05, "loss": 0.5804, "step": 357500 }, { "epoch": 1.0120541873035258, "grad_norm": 1.8901152610778809, "learning_rate": 3.313243021160791e-05, "loss": 0.5773, "step": 358000 }, { "epoch": 1.0134676708053463, "grad_norm": 1.7746001482009888, "learning_rate": 3.310887215324423e-05, "loss": 0.576, "step": 358500 }, { "epoch": 1.014881154307167, "grad_norm": 1.887457251548767, "learning_rate": 3.3085314094880556e-05, "loss": 0.5778, "step": 359000 }, { "epoch": 1.0162946378089874, "grad_norm": 1.6673171520233154, "learning_rate": 3.306175603651688e-05, "loss": 0.5736, "step": 359500 }, { "epoch": 1.0177081213108081, "grad_norm": 2.64837646484375, "learning_rate": 3.3038197978153204e-05, "loss": 0.5772, "step": 360000 }, { "epoch": 1.0191216048126286, "grad_norm": 1.7316200733184814, "learning_rate": 3.301463991978953e-05, "loss": 0.5746, "step": 360500 }, { "epoch": 1.0205350883144493, "grad_norm": 1.9742236137390137, "learning_rate": 3.299108186142585e-05, "loss": 0.5788, "step": 361000 }, { "epoch": 1.0219485718162697, "grad_norm": 1.7663785219192505, "learning_rate": 3.2967523803062175e-05, "loss": 0.5733, "step": 361500 }, { "epoch": 1.0233620553180904, "grad_norm": 1.8429192304611206, "learning_rate": 3.29439657446985e-05, "loss": 0.5747, "step": 362000 }, { "epoch": 1.024775538819911, "grad_norm": 1.8249471187591553, "learning_rate": 3.292040768633482e-05, "loss": 0.5767, "step": 362500 }, { "epoch": 1.0261890223217314, "grad_norm": 1.8891184329986572, "learning_rate": 3.2896849627971146e-05, "loss": 0.5732, "step": 363000 }, { "epoch": 1.027602505823552, "grad_norm": 2.05218768119812, "learning_rate": 3.287329156960747e-05, "loss": 0.5721, "step": 363500 }, { "epoch": 1.0290159893253725, "grad_norm": 1.9796159267425537, "learning_rate": 3.284973351124379e-05, "loss": 0.5727, "step": 364000 }, { "epoch": 1.0304294728271932, "grad_norm": 1.976671576499939, "learning_rate": 3.282617545288012e-05, "loss": 0.5765, "step": 364500 }, { "epoch": 1.0318429563290137, "grad_norm": 1.8744670152664185, "learning_rate": 3.280261739451644e-05, "loss": 0.5717, "step": 365000 }, { "epoch": 1.0332564398308344, "grad_norm": 1.8055274486541748, "learning_rate": 3.2779059336152764e-05, "loss": 0.5713, "step": 365500 }, { "epoch": 1.0346699233326548, "grad_norm": 1.877522349357605, "learning_rate": 3.275550127778909e-05, "loss": 0.5761, "step": 366000 }, { "epoch": 1.0360834068344755, "grad_norm": 1.685530424118042, "learning_rate": 3.273194321942541e-05, "loss": 0.5743, "step": 366500 }, { "epoch": 1.037496890336296, "grad_norm": 2.1465983390808105, "learning_rate": 3.2708385161061735e-05, "loss": 0.5711, "step": 367000 }, { "epoch": 1.0389103738381165, "grad_norm": 1.8468915224075317, "learning_rate": 3.268482710269806e-05, "loss": 0.5711, "step": 367500 }, { "epoch": 1.0403238573399372, "grad_norm": 1.837902545928955, "learning_rate": 3.266126904433438e-05, "loss": 0.5735, "step": 368000 }, { "epoch": 1.0417373408417576, "grad_norm": 1.8069218397140503, "learning_rate": 3.2637710985970706e-05, "loss": 0.5728, "step": 368500 }, { "epoch": 1.0431508243435783, "grad_norm": 1.613204002380371, "learning_rate": 3.261415292760703e-05, "loss": 0.5755, "step": 369000 }, { "epoch": 1.0445643078453988, "grad_norm": 1.8371464014053345, "learning_rate": 3.259059486924336e-05, "loss": 0.5691, "step": 369500 }, { "epoch": 1.0459777913472195, "grad_norm": 1.9042716026306152, "learning_rate": 3.2567036810879684e-05, "loss": 0.5692, "step": 370000 }, { "epoch": 1.04739127484904, "grad_norm": 1.936032772064209, "learning_rate": 3.254347875251601e-05, "loss": 0.5692, "step": 370500 }, { "epoch": 1.0488047583508606, "grad_norm": 2.01570987701416, "learning_rate": 3.251992069415233e-05, "loss": 0.5729, "step": 371000 }, { "epoch": 1.050218241852681, "grad_norm": 1.8199867010116577, "learning_rate": 3.2496362635788655e-05, "loss": 0.5715, "step": 371500 }, { "epoch": 1.0516317253545016, "grad_norm": 2.062371253967285, "learning_rate": 3.247280457742498e-05, "loss": 0.5725, "step": 372000 }, { "epoch": 1.0530452088563222, "grad_norm": 1.739439606666565, "learning_rate": 3.24492465190613e-05, "loss": 0.5722, "step": 372500 }, { "epoch": 1.0544586923581427, "grad_norm": 1.936995029449463, "learning_rate": 3.2425688460697626e-05, "loss": 0.5706, "step": 373000 }, { "epoch": 1.0558721758599634, "grad_norm": 1.740294337272644, "learning_rate": 3.240213040233395e-05, "loss": 0.5694, "step": 373500 }, { "epoch": 1.0572856593617839, "grad_norm": 1.8271058797836304, "learning_rate": 3.237857234397027e-05, "loss": 0.5684, "step": 374000 }, { "epoch": 1.0586991428636046, "grad_norm": 1.9104254245758057, "learning_rate": 3.23550142856066e-05, "loss": 0.5691, "step": 374500 }, { "epoch": 1.060112626365425, "grad_norm": 1.9009777307510376, "learning_rate": 3.233145622724292e-05, "loss": 0.5695, "step": 375000 }, { "epoch": 1.0615261098672457, "grad_norm": 1.59981369972229, "learning_rate": 3.2307898168879244e-05, "loss": 0.5716, "step": 375500 }, { "epoch": 1.0629395933690662, "grad_norm": 1.9367072582244873, "learning_rate": 3.228434011051557e-05, "loss": 0.5694, "step": 376000 }, { "epoch": 1.0643530768708866, "grad_norm": 1.8805826902389526, "learning_rate": 3.226078205215189e-05, "loss": 0.5675, "step": 376500 }, { "epoch": 1.0657665603727073, "grad_norm": 1.7523210048675537, "learning_rate": 3.2237223993788215e-05, "loss": 0.5723, "step": 377000 }, { "epoch": 1.0671800438745278, "grad_norm": 1.8737441301345825, "learning_rate": 3.221366593542454e-05, "loss": 0.5704, "step": 377500 }, { "epoch": 1.0685935273763485, "grad_norm": 1.825208306312561, "learning_rate": 3.219010787706086e-05, "loss": 0.5702, "step": 378000 }, { "epoch": 1.070007010878169, "grad_norm": 1.8157204389572144, "learning_rate": 3.2166549818697186e-05, "loss": 0.5708, "step": 378500 }, { "epoch": 1.0714204943799897, "grad_norm": 1.7042979001998901, "learning_rate": 3.214299176033351e-05, "loss": 0.5701, "step": 379000 }, { "epoch": 1.0728339778818101, "grad_norm": 1.598936676979065, "learning_rate": 3.2119433701969834e-05, "loss": 0.5734, "step": 379500 }, { "epoch": 1.0742474613836308, "grad_norm": 1.8905622959136963, "learning_rate": 3.209587564360616e-05, "loss": 0.5697, "step": 380000 }, { "epoch": 1.0756609448854513, "grad_norm": 1.9121912717819214, "learning_rate": 3.207231758524248e-05, "loss": 0.5676, "step": 380500 }, { "epoch": 1.077074428387272, "grad_norm": 1.776739239692688, "learning_rate": 3.2048759526878805e-05, "loss": 0.5725, "step": 381000 }, { "epoch": 1.0784879118890924, "grad_norm": 1.8465319871902466, "learning_rate": 3.202520146851513e-05, "loss": 0.5636, "step": 381500 }, { "epoch": 1.079901395390913, "grad_norm": 2.045461416244507, "learning_rate": 3.200164341015145e-05, "loss": 0.5753, "step": 382000 }, { "epoch": 1.0813148788927336, "grad_norm": 1.9823592901229858, "learning_rate": 3.1978085351787776e-05, "loss": 0.5664, "step": 382500 }, { "epoch": 1.082728362394554, "grad_norm": 1.7886230945587158, "learning_rate": 3.19545272934241e-05, "loss": 0.5677, "step": 383000 }, { "epoch": 1.0841418458963747, "grad_norm": 2.094017267227173, "learning_rate": 3.193096923506042e-05, "loss": 0.5685, "step": 383500 }, { "epoch": 1.0855553293981952, "grad_norm": 2.120039701461792, "learning_rate": 3.190741117669675e-05, "loss": 0.5738, "step": 384000 }, { "epoch": 1.086968812900016, "grad_norm": 2.050354242324829, "learning_rate": 3.188385311833307e-05, "loss": 0.5686, "step": 384500 }, { "epoch": 1.0883822964018364, "grad_norm": 1.9706401824951172, "learning_rate": 3.1860295059969394e-05, "loss": 0.5674, "step": 385000 }, { "epoch": 1.089795779903657, "grad_norm": 1.876920223236084, "learning_rate": 3.183673700160572e-05, "loss": 0.5703, "step": 385500 }, { "epoch": 1.0912092634054775, "grad_norm": 1.8551950454711914, "learning_rate": 3.181317894324204e-05, "loss": 0.5669, "step": 386000 }, { "epoch": 1.0926227469072982, "grad_norm": 1.8999260663986206, "learning_rate": 3.1789620884878365e-05, "loss": 0.565, "step": 386500 }, { "epoch": 1.0940362304091187, "grad_norm": 1.7998162508010864, "learning_rate": 3.176606282651469e-05, "loss": 0.5697, "step": 387000 }, { "epoch": 1.0954497139109391, "grad_norm": 1.8401626348495483, "learning_rate": 3.174250476815101e-05, "loss": 0.5677, "step": 387500 }, { "epoch": 1.0968631974127598, "grad_norm": 1.9232958555221558, "learning_rate": 3.1718946709787336e-05, "loss": 0.5675, "step": 388000 }, { "epoch": 1.0982766809145803, "grad_norm": 1.6265684366226196, "learning_rate": 3.169538865142366e-05, "loss": 0.5644, "step": 388500 }, { "epoch": 1.099690164416401, "grad_norm": 2.015392541885376, "learning_rate": 3.1671830593059984e-05, "loss": 0.5643, "step": 389000 }, { "epoch": 1.1011036479182215, "grad_norm": 1.8779159784317017, "learning_rate": 3.164827253469631e-05, "loss": 0.5715, "step": 389500 }, { "epoch": 1.1025171314200422, "grad_norm": 1.835094928741455, "learning_rate": 3.162471447633263e-05, "loss": 0.5682, "step": 390000 }, { "epoch": 1.1039306149218626, "grad_norm": 1.7827551364898682, "learning_rate": 3.1601156417968955e-05, "loss": 0.564, "step": 390500 }, { "epoch": 1.105344098423683, "grad_norm": 1.812569499015808, "learning_rate": 3.157759835960528e-05, "loss": 0.5667, "step": 391000 }, { "epoch": 1.1067575819255038, "grad_norm": 1.7027009725570679, "learning_rate": 3.15540403012416e-05, "loss": 0.5645, "step": 391500 }, { "epoch": 1.1081710654273242, "grad_norm": 1.7990082502365112, "learning_rate": 3.1530482242877926e-05, "loss": 0.5666, "step": 392000 }, { "epoch": 1.109584548929145, "grad_norm": 1.8616704940795898, "learning_rate": 3.150692418451425e-05, "loss": 0.5658, "step": 392500 }, { "epoch": 1.1109980324309654, "grad_norm": 1.9962800741195679, "learning_rate": 3.148336612615058e-05, "loss": 0.565, "step": 393000 }, { "epoch": 1.112411515932786, "grad_norm": 1.9534193277359009, "learning_rate": 3.1459808067786903e-05, "loss": 0.5661, "step": 393500 }, { "epoch": 1.1138249994346066, "grad_norm": 2.0489165782928467, "learning_rate": 3.143625000942323e-05, "loss": 0.564, "step": 394000 }, { "epoch": 1.1152384829364272, "grad_norm": 1.8024005889892578, "learning_rate": 3.141269195105955e-05, "loss": 0.5662, "step": 394500 }, { "epoch": 1.1166519664382477, "grad_norm": 1.886540412902832, "learning_rate": 3.1389133892695874e-05, "loss": 0.5648, "step": 395000 }, { "epoch": 1.1180654499400684, "grad_norm": 1.9216201305389404, "learning_rate": 3.13655758343322e-05, "loss": 0.562, "step": 395500 }, { "epoch": 1.1194789334418889, "grad_norm": 1.7266943454742432, "learning_rate": 3.134201777596852e-05, "loss": 0.5648, "step": 396000 }, { "epoch": 1.1208924169437093, "grad_norm": 1.9458645582199097, "learning_rate": 3.1318459717604845e-05, "loss": 0.5655, "step": 396500 }, { "epoch": 1.12230590044553, "grad_norm": 1.7380398511886597, "learning_rate": 3.129490165924117e-05, "loss": 0.565, "step": 397000 }, { "epoch": 1.1237193839473505, "grad_norm": 1.82628333568573, "learning_rate": 3.127134360087749e-05, "loss": 0.5682, "step": 397500 }, { "epoch": 1.1251328674491712, "grad_norm": 1.9547462463378906, "learning_rate": 3.1247785542513816e-05, "loss": 0.5638, "step": 398000 }, { "epoch": 1.1265463509509916, "grad_norm": 1.817070484161377, "learning_rate": 3.122422748415014e-05, "loss": 0.5673, "step": 398500 }, { "epoch": 1.1279598344528123, "grad_norm": 1.9636874198913574, "learning_rate": 3.1200669425786464e-05, "loss": 0.5643, "step": 399000 }, { "epoch": 1.1293733179546328, "grad_norm": 1.7687526941299438, "learning_rate": 3.117711136742279e-05, "loss": 0.566, "step": 399500 }, { "epoch": 1.1307868014564535, "grad_norm": 2.023736000061035, "learning_rate": 3.115355330905911e-05, "loss": 0.5651, "step": 400000 }, { "epoch": 1.132200284958274, "grad_norm": 1.6987468004226685, "learning_rate": 3.1129995250695435e-05, "loss": 0.56, "step": 400500 }, { "epoch": 1.1336137684600947, "grad_norm": 1.9995132684707642, "learning_rate": 3.110643719233176e-05, "loss": 0.564, "step": 401000 }, { "epoch": 1.1350272519619151, "grad_norm": 1.7273468971252441, "learning_rate": 3.108287913396808e-05, "loss": 0.5646, "step": 401500 }, { "epoch": 1.1364407354637356, "grad_norm": 1.8252230882644653, "learning_rate": 3.1059321075604406e-05, "loss": 0.5594, "step": 402000 }, { "epoch": 1.1378542189655563, "grad_norm": 2.01387095451355, "learning_rate": 3.103576301724073e-05, "loss": 0.5626, "step": 402500 }, { "epoch": 1.1392677024673767, "grad_norm": 1.8901734352111816, "learning_rate": 3.101220495887705e-05, "loss": 0.5658, "step": 403000 }, { "epoch": 1.1406811859691974, "grad_norm": 1.8873628377914429, "learning_rate": 3.098864690051338e-05, "loss": 0.5617, "step": 403500 }, { "epoch": 1.142094669471018, "grad_norm": 1.9508055448532104, "learning_rate": 3.09650888421497e-05, "loss": 0.5591, "step": 404000 }, { "epoch": 1.1435081529728386, "grad_norm": 1.9081474542617798, "learning_rate": 3.0941530783786024e-05, "loss": 0.5663, "step": 404500 }, { "epoch": 1.144921636474659, "grad_norm": 1.8064162731170654, "learning_rate": 3.091797272542235e-05, "loss": 0.5646, "step": 405000 }, { "epoch": 1.1463351199764795, "grad_norm": 1.7851624488830566, "learning_rate": 3.089441466705867e-05, "loss": 0.5633, "step": 405500 }, { "epoch": 1.1477486034783002, "grad_norm": 1.872293472290039, "learning_rate": 3.0870856608694995e-05, "loss": 0.5638, "step": 406000 }, { "epoch": 1.1491620869801207, "grad_norm": 1.7447619438171387, "learning_rate": 3.084729855033132e-05, "loss": 0.5696, "step": 406500 }, { "epoch": 1.1505755704819414, "grad_norm": 1.7427351474761963, "learning_rate": 3.082374049196764e-05, "loss": 0.5651, "step": 407000 }, { "epoch": 1.1519890539837618, "grad_norm": 1.9975290298461914, "learning_rate": 3.0800182433603966e-05, "loss": 0.5653, "step": 407500 }, { "epoch": 1.1534025374855825, "grad_norm": 2.0143134593963623, "learning_rate": 3.077662437524029e-05, "loss": 0.565, "step": 408000 }, { "epoch": 1.154816020987403, "grad_norm": 1.8956010341644287, "learning_rate": 3.0753066316876614e-05, "loss": 0.5615, "step": 408500 }, { "epoch": 1.1562295044892237, "grad_norm": 1.7582226991653442, "learning_rate": 3.072950825851294e-05, "loss": 0.5592, "step": 409000 }, { "epoch": 1.1576429879910441, "grad_norm": 1.8083794116973877, "learning_rate": 3.070595020014926e-05, "loss": 0.5653, "step": 409500 }, { "epoch": 1.1590564714928648, "grad_norm": 1.7141129970550537, "learning_rate": 3.0682392141785585e-05, "loss": 0.5639, "step": 410000 }, { "epoch": 1.1604699549946853, "grad_norm": 1.9343377351760864, "learning_rate": 3.065883408342191e-05, "loss": 0.5644, "step": 410500 }, { "epoch": 1.1618834384965058, "grad_norm": 1.8747937679290771, "learning_rate": 3.063527602505823e-05, "loss": 0.5597, "step": 411000 }, { "epoch": 1.1632969219983265, "grad_norm": 1.9360160827636719, "learning_rate": 3.0611717966694556e-05, "loss": 0.5643, "step": 411500 }, { "epoch": 1.164710405500147, "grad_norm": 1.908011555671692, "learning_rate": 3.058815990833088e-05, "loss": 0.5603, "step": 412000 }, { "epoch": 1.1661238890019676, "grad_norm": 1.8895214796066284, "learning_rate": 3.05646018499672e-05, "loss": 0.5619, "step": 412500 }, { "epoch": 1.167537372503788, "grad_norm": 1.7850741147994995, "learning_rate": 3.0541043791603533e-05, "loss": 0.5637, "step": 413000 }, { "epoch": 1.1689508560056088, "grad_norm": 1.9580038785934448, "learning_rate": 3.051748573323986e-05, "loss": 0.5623, "step": 413500 }, { "epoch": 1.1703643395074292, "grad_norm": 1.948678970336914, "learning_rate": 3.0493927674876184e-05, "loss": 0.5617, "step": 414000 }, { "epoch": 1.17177782300925, "grad_norm": 1.6357256174087524, "learning_rate": 3.0470369616512508e-05, "loss": 0.5651, "step": 414500 }, { "epoch": 1.1731913065110704, "grad_norm": 1.7497944831848145, "learning_rate": 3.044681155814883e-05, "loss": 0.5581, "step": 415000 }, { "epoch": 1.174604790012891, "grad_norm": 1.953696846961975, "learning_rate": 3.0423253499785155e-05, "loss": 0.5587, "step": 415500 }, { "epoch": 1.1760182735147116, "grad_norm": 1.9111846685409546, "learning_rate": 3.039969544142148e-05, "loss": 0.56, "step": 416000 }, { "epoch": 1.177431757016532, "grad_norm": 2.003554344177246, "learning_rate": 3.0376137383057803e-05, "loss": 0.5606, "step": 416500 }, { "epoch": 1.1788452405183527, "grad_norm": 1.8863508701324463, "learning_rate": 3.0352579324694126e-05, "loss": 0.5629, "step": 417000 }, { "epoch": 1.1802587240201732, "grad_norm": 1.8781440258026123, "learning_rate": 3.032902126633045e-05, "loss": 0.5585, "step": 417500 }, { "epoch": 1.1816722075219939, "grad_norm": 1.8593615293502808, "learning_rate": 3.0305463207966774e-05, "loss": 0.5568, "step": 418000 }, { "epoch": 1.1830856910238143, "grad_norm": 1.9933748245239258, "learning_rate": 3.0281905149603097e-05, "loss": 0.5581, "step": 418500 }, { "epoch": 1.184499174525635, "grad_norm": 1.7915339469909668, "learning_rate": 3.025834709123942e-05, "loss": 0.5598, "step": 419000 }, { "epoch": 1.1859126580274555, "grad_norm": 1.899834394454956, "learning_rate": 3.0234789032875745e-05, "loss": 0.5576, "step": 419500 }, { "epoch": 1.187326141529276, "grad_norm": 1.9428986310958862, "learning_rate": 3.0211230974512068e-05, "loss": 0.5645, "step": 420000 }, { "epoch": 1.1887396250310966, "grad_norm": 1.8248592615127563, "learning_rate": 3.0187672916148392e-05, "loss": 0.5574, "step": 420500 }, { "epoch": 1.1901531085329171, "grad_norm": 1.7828431129455566, "learning_rate": 3.0164114857784716e-05, "loss": 0.5607, "step": 421000 }, { "epoch": 1.1915665920347378, "grad_norm": 1.9157642126083374, "learning_rate": 3.014055679942104e-05, "loss": 0.5599, "step": 421500 }, { "epoch": 1.1929800755365583, "grad_norm": 1.957305908203125, "learning_rate": 3.0116998741057363e-05, "loss": 0.5617, "step": 422000 }, { "epoch": 1.194393559038379, "grad_norm": 1.7812291383743286, "learning_rate": 3.0093440682693687e-05, "loss": 0.5601, "step": 422500 }, { "epoch": 1.1958070425401994, "grad_norm": 1.8594802618026733, "learning_rate": 3.006988262433001e-05, "loss": 0.5538, "step": 423000 }, { "epoch": 1.1972205260420201, "grad_norm": 1.8682737350463867, "learning_rate": 3.0046324565966334e-05, "loss": 0.562, "step": 423500 }, { "epoch": 1.1986340095438406, "grad_norm": 1.817044734954834, "learning_rate": 3.0022766507602658e-05, "loss": 0.5593, "step": 424000 }, { "epoch": 1.2000474930456613, "grad_norm": 1.8820414543151855, "learning_rate": 2.999920844923898e-05, "loss": 0.5597, "step": 424500 }, { "epoch": 1.2014609765474817, "grad_norm": 1.9172815084457397, "learning_rate": 2.9975650390875305e-05, "loss": 0.5577, "step": 425000 }, { "epoch": 1.2028744600493022, "grad_norm": 1.8443692922592163, "learning_rate": 2.995209233251163e-05, "loss": 0.5602, "step": 425500 }, { "epoch": 1.204287943551123, "grad_norm": 1.6127262115478516, "learning_rate": 2.9928534274147952e-05, "loss": 0.5574, "step": 426000 }, { "epoch": 1.2057014270529434, "grad_norm": 1.9710582494735718, "learning_rate": 2.9904976215784276e-05, "loss": 0.5567, "step": 426500 }, { "epoch": 1.207114910554764, "grad_norm": 3.488210678100586, "learning_rate": 2.98814181574206e-05, "loss": 0.5587, "step": 427000 }, { "epoch": 1.2085283940565845, "grad_norm": 1.723077416419983, "learning_rate": 2.9857860099056927e-05, "loss": 0.5632, "step": 427500 }, { "epoch": 1.2099418775584052, "grad_norm": 1.7851358652114868, "learning_rate": 2.983430204069325e-05, "loss": 0.5619, "step": 428000 }, { "epoch": 1.2113553610602257, "grad_norm": 1.8437340259552002, "learning_rate": 2.9810743982329574e-05, "loss": 0.5575, "step": 428500 }, { "epoch": 1.2127688445620464, "grad_norm": 1.8254203796386719, "learning_rate": 2.9787185923965898e-05, "loss": 0.5608, "step": 429000 }, { "epoch": 1.2141823280638668, "grad_norm": 1.8925209045410156, "learning_rate": 2.976362786560222e-05, "loss": 0.5583, "step": 429500 }, { "epoch": 1.2155958115656875, "grad_norm": 1.8084627389907837, "learning_rate": 2.9740069807238545e-05, "loss": 0.5597, "step": 430000 }, { "epoch": 1.217009295067508, "grad_norm": 1.8266175985336304, "learning_rate": 2.971651174887487e-05, "loss": 0.5632, "step": 430500 }, { "epoch": 1.2184227785693285, "grad_norm": 2.0048325061798096, "learning_rate": 2.9692953690511193e-05, "loss": 0.561, "step": 431000 }, { "epoch": 1.2198362620711491, "grad_norm": 1.937211036682129, "learning_rate": 2.9669395632147516e-05, "loss": 0.5588, "step": 431500 }, { "epoch": 1.2212497455729696, "grad_norm": 2.0769424438476562, "learning_rate": 2.964583757378384e-05, "loss": 0.558, "step": 432000 }, { "epoch": 1.2226632290747903, "grad_norm": 1.8893314599990845, "learning_rate": 2.9622279515420164e-05, "loss": 0.5503, "step": 432500 }, { "epoch": 1.2240767125766108, "grad_norm": 2.0230937004089355, "learning_rate": 2.9598721457056487e-05, "loss": 0.555, "step": 433000 }, { "epoch": 1.2254901960784315, "grad_norm": 1.9992939233779907, "learning_rate": 2.957516339869281e-05, "loss": 0.556, "step": 433500 }, { "epoch": 1.226903679580252, "grad_norm": 1.9107600450515747, "learning_rate": 2.9551605340329135e-05, "loss": 0.5556, "step": 434000 }, { "epoch": 1.2283171630820724, "grad_norm": 1.912546992301941, "learning_rate": 2.9528047281965458e-05, "loss": 0.5587, "step": 434500 }, { "epoch": 1.229730646583893, "grad_norm": 1.8283746242523193, "learning_rate": 2.9504489223601782e-05, "loss": 0.5572, "step": 435000 }, { "epoch": 1.2311441300857136, "grad_norm": 1.9601548910140991, "learning_rate": 2.9480931165238106e-05, "loss": 0.5562, "step": 435500 }, { "epoch": 1.2325576135875342, "grad_norm": 1.7548987865447998, "learning_rate": 2.945737310687443e-05, "loss": 0.5566, "step": 436000 }, { "epoch": 1.2339710970893547, "grad_norm": 2.0890002250671387, "learning_rate": 2.9433815048510753e-05, "loss": 0.5578, "step": 436500 }, { "epoch": 1.2353845805911754, "grad_norm": 1.8240549564361572, "learning_rate": 2.9410256990147077e-05, "loss": 0.5568, "step": 437000 }, { "epoch": 1.2367980640929959, "grad_norm": 1.7763612270355225, "learning_rate": 2.93866989317834e-05, "loss": 0.5578, "step": 437500 }, { "epoch": 1.2382115475948166, "grad_norm": 1.7548307180404663, "learning_rate": 2.9363140873419724e-05, "loss": 0.5533, "step": 438000 }, { "epoch": 1.239625031096637, "grad_norm": 1.876711130142212, "learning_rate": 2.9339582815056048e-05, "loss": 0.5539, "step": 438500 }, { "epoch": 1.2410385145984577, "grad_norm": 1.8037004470825195, "learning_rate": 2.931602475669237e-05, "loss": 0.5537, "step": 439000 }, { "epoch": 1.2424519981002782, "grad_norm": 1.924138069152832, "learning_rate": 2.9292466698328695e-05, "loss": 0.5543, "step": 439500 }, { "epoch": 1.2438654816020986, "grad_norm": 1.9263170957565308, "learning_rate": 2.926890863996502e-05, "loss": 0.5598, "step": 440000 }, { "epoch": 1.2452789651039193, "grad_norm": 2.0272207260131836, "learning_rate": 2.9245350581601342e-05, "loss": 0.559, "step": 440500 }, { "epoch": 1.2466924486057398, "grad_norm": 1.8373621702194214, "learning_rate": 2.922179252323767e-05, "loss": 0.5549, "step": 441000 }, { "epoch": 1.2481059321075605, "grad_norm": 2.173574686050415, "learning_rate": 2.9198234464873993e-05, "loss": 0.5561, "step": 441500 }, { "epoch": 1.249519415609381, "grad_norm": 1.8925998210906982, "learning_rate": 2.917467640651032e-05, "loss": 0.5573, "step": 442000 }, { "epoch": 1.2509328991112016, "grad_norm": 1.7318665981292725, "learning_rate": 2.9151118348146644e-05, "loss": 0.5562, "step": 442500 }, { "epoch": 1.2523463826130221, "grad_norm": 2.0995209217071533, "learning_rate": 2.9127560289782967e-05, "loss": 0.5566, "step": 443000 }, { "epoch": 1.2537598661148426, "grad_norm": 1.9292051792144775, "learning_rate": 2.910400223141929e-05, "loss": 0.5522, "step": 443500 }, { "epoch": 1.2551733496166633, "grad_norm": 1.7525311708450317, "learning_rate": 2.9080444173055615e-05, "loss": 0.555, "step": 444000 }, { "epoch": 1.256586833118484, "grad_norm": 1.6556669473648071, "learning_rate": 2.905688611469194e-05, "loss": 0.5516, "step": 444500 }, { "epoch": 1.2580003166203044, "grad_norm": 1.8059078454971313, "learning_rate": 2.9033328056328262e-05, "loss": 0.5537, "step": 445000 }, { "epoch": 1.259413800122125, "grad_norm": 1.8970226049423218, "learning_rate": 2.9009769997964586e-05, "loss": 0.5566, "step": 445500 }, { "epoch": 1.2608272836239456, "grad_norm": 1.7759108543395996, "learning_rate": 2.898621193960091e-05, "loss": 0.5519, "step": 446000 }, { "epoch": 1.262240767125766, "grad_norm": 1.9896800518035889, "learning_rate": 2.8962653881237233e-05, "loss": 0.5512, "step": 446500 }, { "epoch": 1.2636542506275867, "grad_norm": 1.8087400197982788, "learning_rate": 2.8939095822873557e-05, "loss": 0.5559, "step": 447000 }, { "epoch": 1.2650677341294072, "grad_norm": 1.7296284437179565, "learning_rate": 2.891553776450988e-05, "loss": 0.5532, "step": 447500 }, { "epoch": 1.266481217631228, "grad_norm": 1.8271386623382568, "learning_rate": 2.8891979706146204e-05, "loss": 0.5532, "step": 448000 }, { "epoch": 1.2678947011330484, "grad_norm": 1.9870095252990723, "learning_rate": 2.8868421647782528e-05, "loss": 0.5534, "step": 448500 }, { "epoch": 1.2693081846348688, "grad_norm": 1.9679888486862183, "learning_rate": 2.884486358941885e-05, "loss": 0.5545, "step": 449000 }, { "epoch": 1.2707216681366895, "grad_norm": 1.8013232946395874, "learning_rate": 2.8821305531055175e-05, "loss": 0.5542, "step": 449500 }, { "epoch": 1.2721351516385102, "grad_norm": 1.788169264793396, "learning_rate": 2.87977474726915e-05, "loss": 0.5506, "step": 450000 }, { "epoch": 1.2735486351403307, "grad_norm": 1.8005424737930298, "learning_rate": 2.8774189414327823e-05, "loss": 0.5521, "step": 450500 }, { "epoch": 1.2749621186421511, "grad_norm": 1.9664697647094727, "learning_rate": 2.8750631355964146e-05, "loss": 0.5489, "step": 451000 }, { "epoch": 1.2763756021439718, "grad_norm": 1.8248878717422485, "learning_rate": 2.872707329760047e-05, "loss": 0.5549, "step": 451500 }, { "epoch": 1.2777890856457923, "grad_norm": 1.873817801475525, "learning_rate": 2.8703515239236794e-05, "loss": 0.5572, "step": 452000 }, { "epoch": 1.279202569147613, "grad_norm": 1.9384874105453491, "learning_rate": 2.8679957180873117e-05, "loss": 0.5522, "step": 452500 }, { "epoch": 1.2806160526494335, "grad_norm": 1.8835883140563965, "learning_rate": 2.865639912250944e-05, "loss": 0.5521, "step": 453000 }, { "epoch": 1.2820295361512541, "grad_norm": 1.9352624416351318, "learning_rate": 2.8632841064145765e-05, "loss": 0.556, "step": 453500 }, { "epoch": 1.2834430196530746, "grad_norm": 1.813751220703125, "learning_rate": 2.8609283005782088e-05, "loss": 0.5511, "step": 454000 }, { "epoch": 1.284856503154895, "grad_norm": 1.8006011247634888, "learning_rate": 2.8585724947418412e-05, "loss": 0.5495, "step": 454500 }, { "epoch": 1.2862699866567158, "grad_norm": 1.8108090162277222, "learning_rate": 2.8562166889054736e-05, "loss": 0.5498, "step": 455000 }, { "epoch": 1.2876834701585362, "grad_norm": 1.8712856769561768, "learning_rate": 2.853860883069106e-05, "loss": 0.5506, "step": 455500 }, { "epoch": 1.289096953660357, "grad_norm": 1.7733298540115356, "learning_rate": 2.8515050772327383e-05, "loss": 0.5513, "step": 456000 }, { "epoch": 1.2905104371621774, "grad_norm": 2.0178093910217285, "learning_rate": 2.8491492713963713e-05, "loss": 0.549, "step": 456500 }, { "epoch": 1.291923920663998, "grad_norm": 1.9387502670288086, "learning_rate": 2.8467934655600037e-05, "loss": 0.5531, "step": 457000 }, { "epoch": 1.2933374041658185, "grad_norm": 1.7426408529281616, "learning_rate": 2.844437659723636e-05, "loss": 0.5504, "step": 457500 }, { "epoch": 1.294750887667639, "grad_norm": 1.9285242557525635, "learning_rate": 2.8420818538872684e-05, "loss": 0.5553, "step": 458000 }, { "epoch": 1.2961643711694597, "grad_norm": 1.9304567575454712, "learning_rate": 2.8397260480509008e-05, "loss": 0.5506, "step": 458500 }, { "epoch": 1.2975778546712804, "grad_norm": 1.9887248277664185, "learning_rate": 2.8373702422145332e-05, "loss": 0.5481, "step": 459000 }, { "epoch": 1.2989913381731009, "grad_norm": 2.0274765491485596, "learning_rate": 2.8350144363781655e-05, "loss": 0.5494, "step": 459500 }, { "epoch": 1.3004048216749213, "grad_norm": 1.8806848526000977, "learning_rate": 2.832658630541798e-05, "loss": 0.553, "step": 460000 }, { "epoch": 1.301818305176742, "grad_norm": 1.7800383567810059, "learning_rate": 2.8303028247054303e-05, "loss": 0.552, "step": 460500 }, { "epoch": 1.3032317886785625, "grad_norm": 1.7797297239303589, "learning_rate": 2.8279470188690627e-05, "loss": 0.5489, "step": 461000 }, { "epoch": 1.3046452721803832, "grad_norm": 1.8209893703460693, "learning_rate": 2.825591213032695e-05, "loss": 0.5514, "step": 461500 }, { "epoch": 1.3060587556822036, "grad_norm": 2.0098581314086914, "learning_rate": 2.8232354071963274e-05, "loss": 0.5531, "step": 462000 }, { "epoch": 1.3074722391840243, "grad_norm": 1.9471644163131714, "learning_rate": 2.8208796013599598e-05, "loss": 0.5556, "step": 462500 }, { "epoch": 1.3088857226858448, "grad_norm": 1.9235163927078247, "learning_rate": 2.818523795523592e-05, "loss": 0.5537, "step": 463000 }, { "epoch": 1.3102992061876653, "grad_norm": 1.7901712656021118, "learning_rate": 2.8161679896872245e-05, "loss": 0.5502, "step": 463500 }, { "epoch": 1.311712689689486, "grad_norm": 1.7299185991287231, "learning_rate": 2.813812183850857e-05, "loss": 0.5495, "step": 464000 }, { "epoch": 1.3131261731913066, "grad_norm": 1.882616400718689, "learning_rate": 2.8114563780144892e-05, "loss": 0.548, "step": 464500 }, { "epoch": 1.3145396566931271, "grad_norm": 1.917283058166504, "learning_rate": 2.8091005721781216e-05, "loss": 0.5495, "step": 465000 }, { "epoch": 1.3159531401949476, "grad_norm": 1.8400158882141113, "learning_rate": 2.806744766341754e-05, "loss": 0.5466, "step": 465500 }, { "epoch": 1.3173666236967683, "grad_norm": 1.8193550109863281, "learning_rate": 2.8043889605053863e-05, "loss": 0.5535, "step": 466000 }, { "epoch": 1.3187801071985887, "grad_norm": 1.830562710762024, "learning_rate": 2.8020331546690187e-05, "loss": 0.5494, "step": 466500 }, { "epoch": 1.3201935907004094, "grad_norm": 1.7912331819534302, "learning_rate": 2.799677348832651e-05, "loss": 0.5515, "step": 467000 }, { "epoch": 1.32160707420223, "grad_norm": 1.8949259519577026, "learning_rate": 2.7973215429962834e-05, "loss": 0.5476, "step": 467500 }, { "epoch": 1.3230205577040506, "grad_norm": 1.6309354305267334, "learning_rate": 2.7949657371599158e-05, "loss": 0.5493, "step": 468000 }, { "epoch": 1.324434041205871, "grad_norm": 1.9015100002288818, "learning_rate": 2.792609931323548e-05, "loss": 0.5505, "step": 468500 }, { "epoch": 1.3258475247076915, "grad_norm": 1.7936887741088867, "learning_rate": 2.7902541254871805e-05, "loss": 0.5507, "step": 469000 }, { "epoch": 1.3272610082095122, "grad_norm": 1.909964680671692, "learning_rate": 2.787898319650813e-05, "loss": 0.5497, "step": 469500 }, { "epoch": 1.3286744917113327, "grad_norm": 1.8636785745620728, "learning_rate": 2.7855425138144453e-05, "loss": 0.5527, "step": 470000 }, { "epoch": 1.3300879752131534, "grad_norm": 1.8510348796844482, "learning_rate": 2.7831867079780776e-05, "loss": 0.5472, "step": 470500 }, { "epoch": 1.3315014587149738, "grad_norm": 1.9420288801193237, "learning_rate": 2.7808309021417107e-05, "loss": 0.5484, "step": 471000 }, { "epoch": 1.3329149422167945, "grad_norm": 1.853553056716919, "learning_rate": 2.778475096305343e-05, "loss": 0.5468, "step": 471500 }, { "epoch": 1.334328425718615, "grad_norm": 1.9472386837005615, "learning_rate": 2.7761192904689754e-05, "loss": 0.5524, "step": 472000 }, { "epoch": 1.3357419092204355, "grad_norm": 1.83768892288208, "learning_rate": 2.7737634846326078e-05, "loss": 0.5426, "step": 472500 }, { "epoch": 1.3371553927222561, "grad_norm": 1.895118236541748, "learning_rate": 2.77140767879624e-05, "loss": 0.545, "step": 473000 }, { "epoch": 1.3385688762240768, "grad_norm": 1.872310996055603, "learning_rate": 2.7690518729598725e-05, "loss": 0.5488, "step": 473500 }, { "epoch": 1.3399823597258973, "grad_norm": 1.654121994972229, "learning_rate": 2.766696067123505e-05, "loss": 0.5504, "step": 474000 }, { "epoch": 1.3413958432277178, "grad_norm": 1.9186142683029175, "learning_rate": 2.7643402612871372e-05, "loss": 0.5482, "step": 474500 }, { "epoch": 1.3428093267295385, "grad_norm": 1.7632867097854614, "learning_rate": 2.7619844554507696e-05, "loss": 0.5475, "step": 475000 }, { "epoch": 1.344222810231359, "grad_norm": 1.7896806001663208, "learning_rate": 2.759628649614402e-05, "loss": 0.5501, "step": 475500 }, { "epoch": 1.3456362937331796, "grad_norm": 1.9777891635894775, "learning_rate": 2.7572728437780344e-05, "loss": 0.5476, "step": 476000 }, { "epoch": 1.347049777235, "grad_norm": 1.7893006801605225, "learning_rate": 2.7549170379416667e-05, "loss": 0.5456, "step": 476500 }, { "epoch": 1.3484632607368208, "grad_norm": 1.9832487106323242, "learning_rate": 2.752561232105299e-05, "loss": 0.5435, "step": 477000 }, { "epoch": 1.3498767442386412, "grad_norm": 1.8393890857696533, "learning_rate": 2.7502054262689315e-05, "loss": 0.5462, "step": 477500 }, { "epoch": 1.3512902277404617, "grad_norm": 1.9267538785934448, "learning_rate": 2.7478496204325638e-05, "loss": 0.5453, "step": 478000 }, { "epoch": 1.3527037112422824, "grad_norm": 2.01885986328125, "learning_rate": 2.7454938145961962e-05, "loss": 0.5468, "step": 478500 }, { "epoch": 1.354117194744103, "grad_norm": 2.0099093914031982, "learning_rate": 2.7431380087598286e-05, "loss": 0.5453, "step": 479000 }, { "epoch": 1.3555306782459235, "grad_norm": 1.835996389389038, "learning_rate": 2.740782202923461e-05, "loss": 0.5433, "step": 479500 }, { "epoch": 1.356944161747744, "grad_norm": 1.9237440824508667, "learning_rate": 2.7384263970870933e-05, "loss": 0.5453, "step": 480000 }, { "epoch": 1.3583576452495647, "grad_norm": 1.8566088676452637, "learning_rate": 2.7360705912507257e-05, "loss": 0.5449, "step": 480500 }, { "epoch": 1.3597711287513852, "grad_norm": 1.7944880723953247, "learning_rate": 2.733714785414358e-05, "loss": 0.5478, "step": 481000 }, { "epoch": 1.3611846122532059, "grad_norm": 1.932460904121399, "learning_rate": 2.7313589795779904e-05, "loss": 0.5465, "step": 481500 }, { "epoch": 1.3625980957550263, "grad_norm": 1.9200137853622437, "learning_rate": 2.7290031737416228e-05, "loss": 0.5451, "step": 482000 }, { "epoch": 1.364011579256847, "grad_norm": 1.8001668453216553, "learning_rate": 2.726647367905255e-05, "loss": 0.5463, "step": 482500 }, { "epoch": 1.3654250627586675, "grad_norm": 2.100968360900879, "learning_rate": 2.7242915620688875e-05, "loss": 0.5468, "step": 483000 }, { "epoch": 1.366838546260488, "grad_norm": 1.8190577030181885, "learning_rate": 2.72193575623252e-05, "loss": 0.5479, "step": 483500 }, { "epoch": 1.3682520297623086, "grad_norm": 1.8856805562973022, "learning_rate": 2.7195799503961522e-05, "loss": 0.5454, "step": 484000 }, { "epoch": 1.369665513264129, "grad_norm": 1.8566054105758667, "learning_rate": 2.7172241445597846e-05, "loss": 0.5472, "step": 484500 }, { "epoch": 1.3710789967659498, "grad_norm": 1.8694363832473755, "learning_rate": 2.714868338723417e-05, "loss": 0.5485, "step": 485000 }, { "epoch": 1.3724924802677703, "grad_norm": 1.848070502281189, "learning_rate": 2.71251253288705e-05, "loss": 0.5458, "step": 485500 }, { "epoch": 1.373905963769591, "grad_norm": 1.7457643747329712, "learning_rate": 2.7101567270506824e-05, "loss": 0.5454, "step": 486000 }, { "epoch": 1.3753194472714114, "grad_norm": 1.691705346107483, "learning_rate": 2.7078009212143147e-05, "loss": 0.5435, "step": 486500 }, { "epoch": 1.3767329307732319, "grad_norm": 1.8890533447265625, "learning_rate": 2.705445115377947e-05, "loss": 0.5447, "step": 487000 }, { "epoch": 1.3781464142750526, "grad_norm": 1.8327594995498657, "learning_rate": 2.7030893095415795e-05, "loss": 0.543, "step": 487500 }, { "epoch": 1.3795598977768733, "grad_norm": 1.7901325225830078, "learning_rate": 2.700733503705212e-05, "loss": 0.5449, "step": 488000 }, { "epoch": 1.3809733812786937, "grad_norm": 1.9382356405258179, "learning_rate": 2.6983776978688442e-05, "loss": 0.5434, "step": 488500 }, { "epoch": 1.3823868647805142, "grad_norm": 1.8815854787826538, "learning_rate": 2.6960218920324766e-05, "loss": 0.5499, "step": 489000 }, { "epoch": 1.383800348282335, "grad_norm": 1.9799134731292725, "learning_rate": 2.693666086196109e-05, "loss": 0.5425, "step": 489500 }, { "epoch": 1.3852138317841554, "grad_norm": 1.9268486499786377, "learning_rate": 2.6913102803597413e-05, "loss": 0.5497, "step": 490000 }, { "epoch": 1.386627315285976, "grad_norm": 1.590950846672058, "learning_rate": 2.6889544745233737e-05, "loss": 0.5411, "step": 490500 }, { "epoch": 1.3880407987877965, "grad_norm": 2.126508951187134, "learning_rate": 2.686598668687006e-05, "loss": 0.5471, "step": 491000 }, { "epoch": 1.3894542822896172, "grad_norm": 1.7696080207824707, "learning_rate": 2.6842428628506384e-05, "loss": 0.5467, "step": 491500 }, { "epoch": 1.3908677657914377, "grad_norm": 1.8131080865859985, "learning_rate": 2.6818870570142708e-05, "loss": 0.5474, "step": 492000 }, { "epoch": 1.3922812492932581, "grad_norm": 1.8530930280685425, "learning_rate": 2.679531251177903e-05, "loss": 0.5441, "step": 492500 }, { "epoch": 1.3936947327950788, "grad_norm": 1.905764102935791, "learning_rate": 2.6771754453415355e-05, "loss": 0.5449, "step": 493000 }, { "epoch": 1.3951082162968995, "grad_norm": 1.8213846683502197, "learning_rate": 2.674819639505168e-05, "loss": 0.5475, "step": 493500 }, { "epoch": 1.39652169979872, "grad_norm": 1.997665286064148, "learning_rate": 2.6724638336688003e-05, "loss": 0.5442, "step": 494000 }, { "epoch": 1.3979351833005405, "grad_norm": 1.9124455451965332, "learning_rate": 2.6701080278324326e-05, "loss": 0.5459, "step": 494500 }, { "epoch": 1.3993486668023611, "grad_norm": 2.0003795623779297, "learning_rate": 2.667752221996065e-05, "loss": 0.5436, "step": 495000 }, { "epoch": 1.4007621503041816, "grad_norm": 1.770445704460144, "learning_rate": 2.6653964161596974e-05, "loss": 0.5424, "step": 495500 }, { "epoch": 1.4021756338060023, "grad_norm": 2.116727113723755, "learning_rate": 2.6630406103233297e-05, "loss": 0.5445, "step": 496000 }, { "epoch": 1.4035891173078228, "grad_norm": 1.8007227182388306, "learning_rate": 2.660684804486962e-05, "loss": 0.5418, "step": 496500 }, { "epoch": 1.4050026008096435, "grad_norm": 1.9678304195404053, "learning_rate": 2.6583289986505945e-05, "loss": 0.5421, "step": 497000 }, { "epoch": 1.406416084311464, "grad_norm": 1.9536603689193726, "learning_rate": 2.6559731928142268e-05, "loss": 0.5403, "step": 497500 }, { "epoch": 1.4078295678132844, "grad_norm": 1.848536491394043, "learning_rate": 2.6536173869778592e-05, "loss": 0.5443, "step": 498000 }, { "epoch": 1.409243051315105, "grad_norm": 2.0253751277923584, "learning_rate": 2.6512615811414916e-05, "loss": 0.5393, "step": 498500 }, { "epoch": 1.4106565348169255, "grad_norm": 1.8042397499084473, "learning_rate": 2.648905775305124e-05, "loss": 0.5427, "step": 499000 }, { "epoch": 1.4120700183187462, "grad_norm": 2.016324281692505, "learning_rate": 2.6465499694687563e-05, "loss": 0.5428, "step": 499500 }, { "epoch": 1.4134835018205667, "grad_norm": 1.6338798999786377, "learning_rate": 2.644194163632389e-05, "loss": 0.543, "step": 500000 }, { "epoch": 1.4148969853223874, "grad_norm": 1.7524570226669312, "learning_rate": 2.6418383577960214e-05, "loss": 0.5434, "step": 500500 }, { "epoch": 1.4163104688242079, "grad_norm": 1.9059929847717285, "learning_rate": 2.6394825519596537e-05, "loss": 0.5409, "step": 501000 }, { "epoch": 1.4177239523260283, "grad_norm": 1.9898536205291748, "learning_rate": 2.637126746123286e-05, "loss": 0.5498, "step": 501500 }, { "epoch": 1.419137435827849, "grad_norm": 1.715096354484558, "learning_rate": 2.6347709402869185e-05, "loss": 0.5416, "step": 502000 }, { "epoch": 1.4205509193296697, "grad_norm": 1.8067779541015625, "learning_rate": 2.632415134450551e-05, "loss": 0.543, "step": 502500 }, { "epoch": 1.4219644028314902, "grad_norm": 1.5921577215194702, "learning_rate": 2.6300593286141832e-05, "loss": 0.5461, "step": 503000 }, { "epoch": 1.4233778863333106, "grad_norm": 1.9253058433532715, "learning_rate": 2.6277035227778156e-05, "loss": 0.5369, "step": 503500 }, { "epoch": 1.4247913698351313, "grad_norm": 1.8958940505981445, "learning_rate": 2.625347716941448e-05, "loss": 0.542, "step": 504000 }, { "epoch": 1.4262048533369518, "grad_norm": 1.9172489643096924, "learning_rate": 2.6229919111050803e-05, "loss": 0.5412, "step": 504500 }, { "epoch": 1.4276183368387725, "grad_norm": 1.8909324407577515, "learning_rate": 2.6206361052687127e-05, "loss": 0.5435, "step": 505000 }, { "epoch": 1.429031820340593, "grad_norm": 1.7988373041152954, "learning_rate": 2.618280299432345e-05, "loss": 0.5425, "step": 505500 }, { "epoch": 1.4304453038424136, "grad_norm": 1.8330591917037964, "learning_rate": 2.6159244935959774e-05, "loss": 0.5404, "step": 506000 }, { "epoch": 1.431858787344234, "grad_norm": 2.003727436065674, "learning_rate": 2.6135686877596098e-05, "loss": 0.542, "step": 506500 }, { "epoch": 1.4332722708460546, "grad_norm": 1.8330305814743042, "learning_rate": 2.611212881923242e-05, "loss": 0.5449, "step": 507000 }, { "epoch": 1.4346857543478753, "grad_norm": 1.7033838033676147, "learning_rate": 2.608857076086875e-05, "loss": 0.538, "step": 507500 }, { "epoch": 1.436099237849696, "grad_norm": 1.7643749713897705, "learning_rate": 2.6065012702505072e-05, "loss": 0.5398, "step": 508000 }, { "epoch": 1.4375127213515164, "grad_norm": 1.8887012004852295, "learning_rate": 2.6041454644141396e-05, "loss": 0.5417, "step": 508500 }, { "epoch": 1.4389262048533369, "grad_norm": 1.898247241973877, "learning_rate": 2.601789658577772e-05, "loss": 0.5411, "step": 509000 }, { "epoch": 1.4403396883551576, "grad_norm": 1.6611384153366089, "learning_rate": 2.5994338527414043e-05, "loss": 0.5468, "step": 509500 }, { "epoch": 1.441753171856978, "grad_norm": 1.8461030721664429, "learning_rate": 2.5970780469050367e-05, "loss": 0.5418, "step": 510000 }, { "epoch": 1.4431666553587987, "grad_norm": 1.8420064449310303, "learning_rate": 2.594722241068669e-05, "loss": 0.5448, "step": 510500 }, { "epoch": 1.4445801388606192, "grad_norm": 1.7131075859069824, "learning_rate": 2.5923664352323014e-05, "loss": 0.5446, "step": 511000 }, { "epoch": 1.44599362236244, "grad_norm": 1.9686027765274048, "learning_rate": 2.5900106293959338e-05, "loss": 0.5411, "step": 511500 }, { "epoch": 1.4474071058642604, "grad_norm": 1.6839299201965332, "learning_rate": 2.587654823559566e-05, "loss": 0.5401, "step": 512000 }, { "epoch": 1.4488205893660808, "grad_norm": 1.8886562585830688, "learning_rate": 2.5852990177231985e-05, "loss": 0.5389, "step": 512500 }, { "epoch": 1.4502340728679015, "grad_norm": 1.8695608377456665, "learning_rate": 2.582943211886831e-05, "loss": 0.5423, "step": 513000 }, { "epoch": 1.451647556369722, "grad_norm": 1.8647687435150146, "learning_rate": 2.5805874060504633e-05, "loss": 0.5373, "step": 513500 }, { "epoch": 1.4530610398715427, "grad_norm": 1.8718979358673096, "learning_rate": 2.5782316002140956e-05, "loss": 0.5402, "step": 514000 }, { "epoch": 1.4544745233733631, "grad_norm": 1.7302871942520142, "learning_rate": 2.5758757943777283e-05, "loss": 0.5392, "step": 514500 }, { "epoch": 1.4558880068751838, "grad_norm": 1.9062303304672241, "learning_rate": 2.5735199885413607e-05, "loss": 0.5425, "step": 515000 }, { "epoch": 1.4573014903770043, "grad_norm": 1.8656256198883057, "learning_rate": 2.571164182704993e-05, "loss": 0.541, "step": 515500 }, { "epoch": 1.4587149738788248, "grad_norm": 1.8922547101974487, "learning_rate": 2.5688083768686254e-05, "loss": 0.5439, "step": 516000 }, { "epoch": 1.4601284573806455, "grad_norm": 1.9779348373413086, "learning_rate": 2.5664525710322578e-05, "loss": 0.5369, "step": 516500 }, { "epoch": 1.4615419408824661, "grad_norm": 2.1146984100341797, "learning_rate": 2.5640967651958902e-05, "loss": 0.5396, "step": 517000 }, { "epoch": 1.4629554243842866, "grad_norm": 1.7519282102584839, "learning_rate": 2.5617409593595225e-05, "loss": 0.5406, "step": 517500 }, { "epoch": 1.464368907886107, "grad_norm": 1.9377520084381104, "learning_rate": 2.559385153523155e-05, "loss": 0.5378, "step": 518000 }, { "epoch": 1.4657823913879278, "grad_norm": 1.856484055519104, "learning_rate": 2.5570293476867873e-05, "loss": 0.5416, "step": 518500 }, { "epoch": 1.4671958748897482, "grad_norm": 1.9151662588119507, "learning_rate": 2.5546735418504196e-05, "loss": 0.5354, "step": 519000 }, { "epoch": 1.468609358391569, "grad_norm": 1.9235942363739014, "learning_rate": 2.552317736014052e-05, "loss": 0.5423, "step": 519500 }, { "epoch": 1.4700228418933894, "grad_norm": 1.9041539430618286, "learning_rate": 2.5499619301776844e-05, "loss": 0.5398, "step": 520000 }, { "epoch": 1.47143632539521, "grad_norm": 1.948202133178711, "learning_rate": 2.5476061243413167e-05, "loss": 0.5397, "step": 520500 }, { "epoch": 1.4728498088970305, "grad_norm": 1.8879996538162231, "learning_rate": 2.545250318504949e-05, "loss": 0.5381, "step": 521000 }, { "epoch": 1.474263292398851, "grad_norm": 1.8301236629486084, "learning_rate": 2.5428945126685815e-05, "loss": 0.5408, "step": 521500 }, { "epoch": 1.4756767759006717, "grad_norm": 1.9326231479644775, "learning_rate": 2.540538706832214e-05, "loss": 0.5385, "step": 522000 }, { "epoch": 1.4770902594024924, "grad_norm": 1.9669442176818848, "learning_rate": 2.5381829009958462e-05, "loss": 0.5426, "step": 522500 }, { "epoch": 1.4785037429043129, "grad_norm": 1.7713794708251953, "learning_rate": 2.5358270951594786e-05, "loss": 0.5392, "step": 523000 }, { "epoch": 1.4799172264061333, "grad_norm": 1.7998411655426025, "learning_rate": 2.533471289323111e-05, "loss": 0.5397, "step": 523500 }, { "epoch": 1.481330709907954, "grad_norm": 1.9299441576004028, "learning_rate": 2.5311154834867433e-05, "loss": 0.54, "step": 524000 }, { "epoch": 1.4827441934097745, "grad_norm": 1.7812176942825317, "learning_rate": 2.5287596776503757e-05, "loss": 0.545, "step": 524500 }, { "epoch": 1.4841576769115952, "grad_norm": 1.9402521848678589, "learning_rate": 2.526403871814008e-05, "loss": 0.5391, "step": 525000 }, { "epoch": 1.4855711604134156, "grad_norm": 1.8655173778533936, "learning_rate": 2.5240480659776404e-05, "loss": 0.5397, "step": 525500 }, { "epoch": 1.4869846439152363, "grad_norm": 1.850485920906067, "learning_rate": 2.5216922601412728e-05, "loss": 0.5359, "step": 526000 }, { "epoch": 1.4883981274170568, "grad_norm": 1.770648717880249, "learning_rate": 2.519336454304905e-05, "loss": 0.5351, "step": 526500 }, { "epoch": 1.4898116109188773, "grad_norm": 1.6586956977844238, "learning_rate": 2.5169806484685375e-05, "loss": 0.5362, "step": 527000 }, { "epoch": 1.491225094420698, "grad_norm": 1.8786216974258423, "learning_rate": 2.51462484263217e-05, "loss": 0.5364, "step": 527500 }, { "epoch": 1.4926385779225184, "grad_norm": 1.8393075466156006, "learning_rate": 2.5122690367958023e-05, "loss": 0.5408, "step": 528000 }, { "epoch": 1.494052061424339, "grad_norm": 1.729907751083374, "learning_rate": 2.5099132309594346e-05, "loss": 0.5387, "step": 528500 }, { "epoch": 1.4954655449261596, "grad_norm": 1.6915360689163208, "learning_rate": 2.5075574251230677e-05, "loss": 0.5336, "step": 529000 }, { "epoch": 1.4968790284279803, "grad_norm": 1.84762442111969, "learning_rate": 2.5052016192867e-05, "loss": 0.5381, "step": 529500 }, { "epoch": 1.4982925119298007, "grad_norm": 1.7360621690750122, "learning_rate": 2.5028458134503324e-05, "loss": 0.5367, "step": 530000 }, { "epoch": 1.4997059954316212, "grad_norm": 1.9734480381011963, "learning_rate": 2.5004900076139648e-05, "loss": 0.5352, "step": 530500 }, { "epoch": 1.5011194789334419, "grad_norm": 1.8794888257980347, "learning_rate": 2.4981342017775968e-05, "loss": 0.5383, "step": 531000 }, { "epoch": 1.5025329624352626, "grad_norm": 1.923997163772583, "learning_rate": 2.495778395941229e-05, "loss": 0.5368, "step": 531500 }, { "epoch": 1.503946445937083, "grad_norm": 1.838708519935608, "learning_rate": 2.4934225901048615e-05, "loss": 0.5384, "step": 532000 }, { "epoch": 1.5053599294389035, "grad_norm": 1.8883566856384277, "learning_rate": 2.4910667842684942e-05, "loss": 0.5357, "step": 532500 }, { "epoch": 1.5067734129407242, "grad_norm": 1.962389349937439, "learning_rate": 2.4887109784321266e-05, "loss": 0.5389, "step": 533000 }, { "epoch": 1.5081868964425449, "grad_norm": 1.9272047281265259, "learning_rate": 2.486355172595759e-05, "loss": 0.5369, "step": 533500 }, { "epoch": 1.5096003799443651, "grad_norm": 2.072408676147461, "learning_rate": 2.4839993667593913e-05, "loss": 0.5371, "step": 534000 }, { "epoch": 1.5110138634461858, "grad_norm": 2.155419111251831, "learning_rate": 2.4816435609230237e-05, "loss": 0.5315, "step": 534500 }, { "epoch": 1.5124273469480065, "grad_norm": 1.7287839651107788, "learning_rate": 2.479287755086656e-05, "loss": 0.5346, "step": 535000 }, { "epoch": 1.513840830449827, "grad_norm": 1.7251795530319214, "learning_rate": 2.4769319492502884e-05, "loss": 0.5352, "step": 535500 }, { "epoch": 1.5152543139516474, "grad_norm": 1.7699825763702393, "learning_rate": 2.4745761434139208e-05, "loss": 0.5389, "step": 536000 }, { "epoch": 1.5166677974534681, "grad_norm": 1.8648276329040527, "learning_rate": 2.4722203375775532e-05, "loss": 0.5377, "step": 536500 }, { "epoch": 1.5180812809552888, "grad_norm": 1.9157524108886719, "learning_rate": 2.4698645317411855e-05, "loss": 0.5365, "step": 537000 }, { "epoch": 1.5194947644571093, "grad_norm": 1.7665483951568604, "learning_rate": 2.467508725904818e-05, "loss": 0.5416, "step": 537500 }, { "epoch": 1.5209082479589298, "grad_norm": 1.9164234399795532, "learning_rate": 2.4651529200684503e-05, "loss": 0.5368, "step": 538000 }, { "epoch": 1.5223217314607504, "grad_norm": 1.7528952360153198, "learning_rate": 2.4627971142320827e-05, "loss": 0.5345, "step": 538500 }, { "epoch": 1.523735214962571, "grad_norm": 1.7764593362808228, "learning_rate": 2.460441308395715e-05, "loss": 0.5317, "step": 539000 }, { "epoch": 1.5251486984643914, "grad_norm": 2.0968079566955566, "learning_rate": 2.4580855025593474e-05, "loss": 0.5334, "step": 539500 }, { "epoch": 1.526562181966212, "grad_norm": 1.8412081003189087, "learning_rate": 2.45572969672298e-05, "loss": 0.5273, "step": 540000 }, { "epoch": 1.5279756654680328, "grad_norm": 2.029167652130127, "learning_rate": 2.4533738908866125e-05, "loss": 0.5342, "step": 540500 }, { "epoch": 1.5293891489698532, "grad_norm": 1.8208391666412354, "learning_rate": 2.4510180850502448e-05, "loss": 0.5391, "step": 541000 }, { "epoch": 1.5308026324716737, "grad_norm": 1.9079804420471191, "learning_rate": 2.4486622792138772e-05, "loss": 0.5384, "step": 541500 }, { "epoch": 1.5322161159734944, "grad_norm": 1.8795431852340698, "learning_rate": 2.4463064733775096e-05, "loss": 0.5389, "step": 542000 }, { "epoch": 1.533629599475315, "grad_norm": 1.8007110357284546, "learning_rate": 2.443950667541142e-05, "loss": 0.5345, "step": 542500 }, { "epoch": 1.5350430829771355, "grad_norm": 1.9617351293563843, "learning_rate": 2.4415948617047743e-05, "loss": 0.5337, "step": 543000 }, { "epoch": 1.536456566478956, "grad_norm": 1.8907381296157837, "learning_rate": 2.4392390558684067e-05, "loss": 0.5361, "step": 543500 }, { "epoch": 1.5378700499807767, "grad_norm": 1.854415774345398, "learning_rate": 2.436883250032039e-05, "loss": 0.5332, "step": 544000 }, { "epoch": 1.5392835334825972, "grad_norm": 1.9486887454986572, "learning_rate": 2.4345274441956714e-05, "loss": 0.5358, "step": 544500 }, { "epoch": 1.5406970169844176, "grad_norm": 1.9131101369857788, "learning_rate": 2.4321716383593038e-05, "loss": 0.5376, "step": 545000 }, { "epoch": 1.5421105004862383, "grad_norm": 1.9263460636138916, "learning_rate": 2.429815832522936e-05, "loss": 0.5309, "step": 545500 }, { "epoch": 1.543523983988059, "grad_norm": 2.0943267345428467, "learning_rate": 2.4274600266865685e-05, "loss": 0.5344, "step": 546000 }, { "epoch": 1.5449374674898795, "grad_norm": 1.8945626020431519, "learning_rate": 2.425104220850201e-05, "loss": 0.5329, "step": 546500 }, { "epoch": 1.5463509509917, "grad_norm": 1.879412055015564, "learning_rate": 2.4227484150138336e-05, "loss": 0.5385, "step": 547000 }, { "epoch": 1.5477644344935206, "grad_norm": 2.113917350769043, "learning_rate": 2.420392609177466e-05, "loss": 0.5341, "step": 547500 }, { "epoch": 1.5491779179953413, "grad_norm": 1.8297146558761597, "learning_rate": 2.4180368033410983e-05, "loss": 0.5333, "step": 548000 }, { "epoch": 1.5505914014971616, "grad_norm": 1.7997350692749023, "learning_rate": 2.4156809975047307e-05, "loss": 0.5364, "step": 548500 }, { "epoch": 1.5520048849989823, "grad_norm": 1.9162224531173706, "learning_rate": 2.413325191668363e-05, "loss": 0.5369, "step": 549000 }, { "epoch": 1.553418368500803, "grad_norm": 1.8541229963302612, "learning_rate": 2.4109693858319954e-05, "loss": 0.5324, "step": 549500 }, { "epoch": 1.5548318520026234, "grad_norm": 2.036745071411133, "learning_rate": 2.4086135799956278e-05, "loss": 0.5309, "step": 550000 }, { "epoch": 1.5562453355044439, "grad_norm": 1.8736358880996704, "learning_rate": 2.40625777415926e-05, "loss": 0.5314, "step": 550500 }, { "epoch": 1.5576588190062646, "grad_norm": 2.0608348846435547, "learning_rate": 2.4039019683228925e-05, "loss": 0.5322, "step": 551000 }, { "epoch": 1.5590723025080853, "grad_norm": 1.957895278930664, "learning_rate": 2.401546162486525e-05, "loss": 0.5331, "step": 551500 }, { "epoch": 1.5604857860099057, "grad_norm": 1.8063135147094727, "learning_rate": 2.3991903566501572e-05, "loss": 0.5312, "step": 552000 }, { "epoch": 1.5618992695117262, "grad_norm": 1.9620682001113892, "learning_rate": 2.3968345508137896e-05, "loss": 0.534, "step": 552500 }, { "epoch": 1.5633127530135469, "grad_norm": 1.9069281816482544, "learning_rate": 2.394478744977422e-05, "loss": 0.5345, "step": 553000 }, { "epoch": 1.5647262365153674, "grad_norm": 1.7852174043655396, "learning_rate": 2.3921229391410544e-05, "loss": 0.5317, "step": 553500 }, { "epoch": 1.5661397200171878, "grad_norm": 2.1418020725250244, "learning_rate": 2.3897671333046867e-05, "loss": 0.533, "step": 554000 }, { "epoch": 1.5675532035190085, "grad_norm": 17.7861385345459, "learning_rate": 2.3874113274683194e-05, "loss": 0.5322, "step": 554500 }, { "epoch": 1.5689666870208292, "grad_norm": 1.7812633514404297, "learning_rate": 2.3850555216319518e-05, "loss": 0.5363, "step": 555000 }, { "epoch": 1.5703801705226497, "grad_norm": 1.781825304031372, "learning_rate": 2.382699715795584e-05, "loss": 0.5383, "step": 555500 }, { "epoch": 1.5717936540244701, "grad_norm": 1.77384614944458, "learning_rate": 2.3803439099592165e-05, "loss": 0.5321, "step": 556000 }, { "epoch": 1.5732071375262908, "grad_norm": 1.698797583580017, "learning_rate": 2.377988104122849e-05, "loss": 0.5307, "step": 556500 }, { "epoch": 1.5746206210281115, "grad_norm": 1.8035169839859009, "learning_rate": 2.3756322982864813e-05, "loss": 0.5354, "step": 557000 }, { "epoch": 1.576034104529932, "grad_norm": 1.9704819917678833, "learning_rate": 2.3732764924501136e-05, "loss": 0.5366, "step": 557500 }, { "epoch": 1.5774475880317524, "grad_norm": 1.9084705114364624, "learning_rate": 2.370920686613746e-05, "loss": 0.5325, "step": 558000 }, { "epoch": 1.5788610715335731, "grad_norm": 1.883638620376587, "learning_rate": 2.3685648807773784e-05, "loss": 0.5362, "step": 558500 }, { "epoch": 1.5802745550353936, "grad_norm": 1.8003673553466797, "learning_rate": 2.3662090749410107e-05, "loss": 0.5348, "step": 559000 }, { "epoch": 1.581688038537214, "grad_norm": 1.698879599571228, "learning_rate": 2.363853269104643e-05, "loss": 0.5313, "step": 559500 }, { "epoch": 1.5831015220390348, "grad_norm": 1.8816334009170532, "learning_rate": 2.3614974632682755e-05, "loss": 0.5268, "step": 560000 }, { "epoch": 1.5845150055408554, "grad_norm": 1.8327192068099976, "learning_rate": 2.359141657431908e-05, "loss": 0.5309, "step": 560500 }, { "epoch": 1.585928489042676, "grad_norm": 1.960365891456604, "learning_rate": 2.3567858515955402e-05, "loss": 0.5376, "step": 561000 }, { "epoch": 1.5873419725444964, "grad_norm": 1.8384820222854614, "learning_rate": 2.354430045759173e-05, "loss": 0.5337, "step": 561500 }, { "epoch": 1.588755456046317, "grad_norm": 1.7398725748062134, "learning_rate": 2.3520742399228053e-05, "loss": 0.5317, "step": 562000 }, { "epoch": 1.5901689395481375, "grad_norm": 1.8271832466125488, "learning_rate": 2.3497184340864376e-05, "loss": 0.5307, "step": 562500 }, { "epoch": 1.591582423049958, "grad_norm": 2.2447891235351562, "learning_rate": 2.34736262825007e-05, "loss": 0.5333, "step": 563000 }, { "epoch": 1.5929959065517787, "grad_norm": 1.8887901306152344, "learning_rate": 2.3450068224137024e-05, "loss": 0.5363, "step": 563500 }, { "epoch": 1.5944093900535994, "grad_norm": 1.8855302333831787, "learning_rate": 2.3426510165773347e-05, "loss": 0.5341, "step": 564000 }, { "epoch": 1.5958228735554199, "grad_norm": 1.9296878576278687, "learning_rate": 2.340295210740967e-05, "loss": 0.5326, "step": 564500 }, { "epoch": 1.5972363570572403, "grad_norm": 1.855703353881836, "learning_rate": 2.3379394049045995e-05, "loss": 0.5292, "step": 565000 }, { "epoch": 1.598649840559061, "grad_norm": 2.0868313312530518, "learning_rate": 2.335583599068232e-05, "loss": 0.5239, "step": 565500 }, { "epoch": 1.6000633240608817, "grad_norm": 1.9121181964874268, "learning_rate": 2.3332277932318642e-05, "loss": 0.5282, "step": 566000 }, { "epoch": 1.6014768075627022, "grad_norm": 1.8232976198196411, "learning_rate": 2.3308719873954966e-05, "loss": 0.5325, "step": 566500 }, { "epoch": 1.6028902910645226, "grad_norm": 1.7509191036224365, "learning_rate": 2.328516181559129e-05, "loss": 0.5267, "step": 567000 }, { "epoch": 1.6043037745663433, "grad_norm": 1.7761201858520508, "learning_rate": 2.3261603757227613e-05, "loss": 0.5307, "step": 567500 }, { "epoch": 1.6057172580681638, "grad_norm": 1.7242580652236938, "learning_rate": 2.3238045698863937e-05, "loss": 0.5328, "step": 568000 }, { "epoch": 1.6071307415699843, "grad_norm": 1.802660346031189, "learning_rate": 2.321448764050026e-05, "loss": 0.5296, "step": 568500 }, { "epoch": 1.608544225071805, "grad_norm": 1.982885718345642, "learning_rate": 2.3190929582136588e-05, "loss": 0.5342, "step": 569000 }, { "epoch": 1.6099577085736256, "grad_norm": 1.98202645778656, "learning_rate": 2.316737152377291e-05, "loss": 0.5259, "step": 569500 }, { "epoch": 1.611371192075446, "grad_norm": 1.9092777967453003, "learning_rate": 2.3143813465409235e-05, "loss": 0.5266, "step": 570000 }, { "epoch": 1.6127846755772666, "grad_norm": 1.8502256870269775, "learning_rate": 2.312025540704556e-05, "loss": 0.5318, "step": 570500 }, { "epoch": 1.6141981590790873, "grad_norm": 1.8383904695510864, "learning_rate": 2.3096697348681882e-05, "loss": 0.5292, "step": 571000 }, { "epoch": 1.615611642580908, "grad_norm": 1.7982367277145386, "learning_rate": 2.3073139290318206e-05, "loss": 0.5314, "step": 571500 }, { "epoch": 1.6170251260827284, "grad_norm": 1.7563743591308594, "learning_rate": 2.304958123195453e-05, "loss": 0.5333, "step": 572000 }, { "epoch": 1.6184386095845489, "grad_norm": 1.8579233884811401, "learning_rate": 2.3026023173590853e-05, "loss": 0.5303, "step": 572500 }, { "epoch": 1.6198520930863696, "grad_norm": 1.8200680017471313, "learning_rate": 2.3002465115227177e-05, "loss": 0.5292, "step": 573000 }, { "epoch": 1.62126557658819, "grad_norm": 1.9027762413024902, "learning_rate": 2.29789070568635e-05, "loss": 0.5249, "step": 573500 }, { "epoch": 1.6226790600900105, "grad_norm": 1.906270146369934, "learning_rate": 2.2955348998499824e-05, "loss": 0.5298, "step": 574000 }, { "epoch": 1.6240925435918312, "grad_norm": 2.3781790733337402, "learning_rate": 2.2931790940136148e-05, "loss": 0.5294, "step": 574500 }, { "epoch": 1.6255060270936519, "grad_norm": 1.9472612142562866, "learning_rate": 2.290823288177247e-05, "loss": 0.531, "step": 575000 }, { "epoch": 1.6269195105954724, "grad_norm": 1.8460426330566406, "learning_rate": 2.2884674823408795e-05, "loss": 0.5243, "step": 575500 }, { "epoch": 1.6283329940972928, "grad_norm": 1.8433873653411865, "learning_rate": 2.2861116765045122e-05, "loss": 0.5304, "step": 576000 }, { "epoch": 1.6297464775991135, "grad_norm": 1.7698158025741577, "learning_rate": 2.2837558706681446e-05, "loss": 0.5278, "step": 576500 }, { "epoch": 1.631159961100934, "grad_norm": 1.8502919673919678, "learning_rate": 2.281400064831777e-05, "loss": 0.53, "step": 577000 }, { "epoch": 1.6325734446027544, "grad_norm": 1.9209575653076172, "learning_rate": 2.2790442589954093e-05, "loss": 0.5267, "step": 577500 }, { "epoch": 1.6339869281045751, "grad_norm": 1.8870681524276733, "learning_rate": 2.2766884531590417e-05, "loss": 0.5303, "step": 578000 }, { "epoch": 1.6354004116063958, "grad_norm": 2.2023983001708984, "learning_rate": 2.274332647322674e-05, "loss": 0.5287, "step": 578500 }, { "epoch": 1.6368138951082163, "grad_norm": 1.858723521232605, "learning_rate": 2.2719768414863064e-05, "loss": 0.5245, "step": 579000 }, { "epoch": 1.6382273786100368, "grad_norm": 1.9231594800949097, "learning_rate": 2.2696210356499388e-05, "loss": 0.532, "step": 579500 }, { "epoch": 1.6396408621118574, "grad_norm": 1.8209397792816162, "learning_rate": 2.2672652298135712e-05, "loss": 0.5255, "step": 580000 }, { "epoch": 1.6410543456136781, "grad_norm": 1.9256136417388916, "learning_rate": 2.2649094239772035e-05, "loss": 0.5293, "step": 580500 }, { "epoch": 1.6424678291154986, "grad_norm": 1.8478854894638062, "learning_rate": 2.262553618140836e-05, "loss": 0.5274, "step": 581000 }, { "epoch": 1.643881312617319, "grad_norm": 1.9126588106155396, "learning_rate": 2.2601978123044683e-05, "loss": 0.5291, "step": 581500 }, { "epoch": 1.6452947961191398, "grad_norm": 2.0643632411956787, "learning_rate": 2.2578420064681006e-05, "loss": 0.5292, "step": 582000 }, { "epoch": 1.6467082796209602, "grad_norm": 1.8195862770080566, "learning_rate": 2.255486200631733e-05, "loss": 0.5305, "step": 582500 }, { "epoch": 1.6481217631227807, "grad_norm": 1.9407540559768677, "learning_rate": 2.2531303947953654e-05, "loss": 0.5303, "step": 583000 }, { "epoch": 1.6495352466246014, "grad_norm": 1.8954795598983765, "learning_rate": 2.2507745889589978e-05, "loss": 0.5243, "step": 583500 }, { "epoch": 1.650948730126422, "grad_norm": 1.8740280866622925, "learning_rate": 2.24841878312263e-05, "loss": 0.5265, "step": 584000 }, { "epoch": 1.6523622136282425, "grad_norm": 2.0115342140197754, "learning_rate": 2.2460629772862625e-05, "loss": 0.5227, "step": 584500 }, { "epoch": 1.653775697130063, "grad_norm": 1.9906878471374512, "learning_rate": 2.243707171449895e-05, "loss": 0.5278, "step": 585000 }, { "epoch": 1.6551891806318837, "grad_norm": 1.8850200176239014, "learning_rate": 2.2413513656135272e-05, "loss": 0.5285, "step": 585500 }, { "epoch": 1.6566026641337044, "grad_norm": 2.003107786178589, "learning_rate": 2.2389955597771596e-05, "loss": 0.5265, "step": 586000 }, { "epoch": 1.6580161476355249, "grad_norm": 1.636924147605896, "learning_rate": 2.236639753940792e-05, "loss": 0.5295, "step": 586500 }, { "epoch": 1.6594296311373453, "grad_norm": 1.7246602773666382, "learning_rate": 2.2342839481044243e-05, "loss": 0.5256, "step": 587000 }, { "epoch": 1.660843114639166, "grad_norm": 1.8154232501983643, "learning_rate": 2.2319281422680567e-05, "loss": 0.5272, "step": 587500 }, { "epoch": 1.6622565981409865, "grad_norm": 1.9160915613174438, "learning_rate": 2.229572336431689e-05, "loss": 0.5238, "step": 588000 }, { "epoch": 1.663670081642807, "grad_norm": 1.7287582159042358, "learning_rate": 2.2272165305953214e-05, "loss": 0.5279, "step": 588500 }, { "epoch": 1.6650835651446276, "grad_norm": 2.0105509757995605, "learning_rate": 2.224860724758954e-05, "loss": 0.5206, "step": 589000 }, { "epoch": 1.6664970486464483, "grad_norm": 1.7233492136001587, "learning_rate": 2.2225049189225865e-05, "loss": 0.5295, "step": 589500 }, { "epoch": 1.6679105321482688, "grad_norm": 1.9100072383880615, "learning_rate": 2.220149113086219e-05, "loss": 0.5268, "step": 590000 }, { "epoch": 1.6693240156500893, "grad_norm": 1.9832935333251953, "learning_rate": 2.2177933072498512e-05, "loss": 0.5271, "step": 590500 }, { "epoch": 1.67073749915191, "grad_norm": 1.9794672727584839, "learning_rate": 2.2154375014134836e-05, "loss": 0.5295, "step": 591000 }, { "epoch": 1.6721509826537304, "grad_norm": 2.0939433574676514, "learning_rate": 2.213081695577116e-05, "loss": 0.5281, "step": 591500 }, { "epoch": 1.6735644661555509, "grad_norm": 1.7690047025680542, "learning_rate": 2.2107258897407483e-05, "loss": 0.5275, "step": 592000 }, { "epoch": 1.6749779496573716, "grad_norm": 1.8273756504058838, "learning_rate": 2.2083700839043807e-05, "loss": 0.5236, "step": 592500 }, { "epoch": 1.6763914331591923, "grad_norm": 1.8159074783325195, "learning_rate": 2.206014278068013e-05, "loss": 0.5279, "step": 593000 }, { "epoch": 1.6778049166610127, "grad_norm": 1.697007656097412, "learning_rate": 2.2036584722316454e-05, "loss": 0.5285, "step": 593500 }, { "epoch": 1.6792184001628332, "grad_norm": 1.701003074645996, "learning_rate": 2.2013026663952778e-05, "loss": 0.5275, "step": 594000 }, { "epoch": 1.6806318836646539, "grad_norm": 2.030367612838745, "learning_rate": 2.1989468605589102e-05, "loss": 0.5267, "step": 594500 }, { "epoch": 1.6820453671664746, "grad_norm": 2.00567364692688, "learning_rate": 2.1965910547225425e-05, "loss": 0.5247, "step": 595000 }, { "epoch": 1.683458850668295, "grad_norm": 2.037113904953003, "learning_rate": 2.194235248886175e-05, "loss": 0.5307, "step": 595500 }, { "epoch": 1.6848723341701155, "grad_norm": 1.8279660940170288, "learning_rate": 2.1918794430498073e-05, "loss": 0.5205, "step": 596000 }, { "epoch": 1.6862858176719362, "grad_norm": 1.759920358657837, "learning_rate": 2.1895236372134396e-05, "loss": 0.5283, "step": 596500 }, { "epoch": 1.6876993011737567, "grad_norm": 1.960317850112915, "learning_rate": 2.187167831377072e-05, "loss": 0.5276, "step": 597000 }, { "epoch": 1.6891127846755771, "grad_norm": 1.8253589868545532, "learning_rate": 2.1848120255407047e-05, "loss": 0.5266, "step": 597500 }, { "epoch": 1.6905262681773978, "grad_norm": 1.9644654989242554, "learning_rate": 2.182456219704337e-05, "loss": 0.5219, "step": 598000 }, { "epoch": 1.6919397516792185, "grad_norm": 2.0212717056274414, "learning_rate": 2.1801004138679695e-05, "loss": 0.5236, "step": 598500 }, { "epoch": 1.693353235181039, "grad_norm": 1.8058758974075317, "learning_rate": 2.1777446080316018e-05, "loss": 0.5238, "step": 599000 }, { "epoch": 1.6947667186828594, "grad_norm": 1.8834084272384644, "learning_rate": 2.1753888021952342e-05, "loss": 0.5239, "step": 599500 }, { "epoch": 1.6961802021846801, "grad_norm": 1.8444125652313232, "learning_rate": 2.1730329963588666e-05, "loss": 0.5271, "step": 600000 }, { "epoch": 1.6975936856865008, "grad_norm": 1.9355394840240479, "learning_rate": 2.170677190522499e-05, "loss": 0.5212, "step": 600500 }, { "epoch": 1.6990071691883213, "grad_norm": 1.8222063779830933, "learning_rate": 2.1683213846861313e-05, "loss": 0.5282, "step": 601000 }, { "epoch": 1.7004206526901418, "grad_norm": 1.8458892107009888, "learning_rate": 2.1659655788497637e-05, "loss": 0.5216, "step": 601500 }, { "epoch": 1.7018341361919624, "grad_norm": 2.0131001472473145, "learning_rate": 2.163609773013396e-05, "loss": 0.5248, "step": 602000 }, { "epoch": 1.703247619693783, "grad_norm": 1.882448673248291, "learning_rate": 2.1612539671770284e-05, "loss": 0.5263, "step": 602500 }, { "epoch": 1.7046611031956034, "grad_norm": 1.8758927583694458, "learning_rate": 2.1588981613406608e-05, "loss": 0.5244, "step": 603000 }, { "epoch": 1.706074586697424, "grad_norm": 1.620144009590149, "learning_rate": 2.156542355504293e-05, "loss": 0.524, "step": 603500 }, { "epoch": 1.7074880701992448, "grad_norm": 2.045086145401001, "learning_rate": 2.1541865496679255e-05, "loss": 0.525, "step": 604000 }, { "epoch": 1.7089015537010652, "grad_norm": 1.8403676748275757, "learning_rate": 2.151830743831558e-05, "loss": 0.5225, "step": 604500 }, { "epoch": 1.7103150372028857, "grad_norm": 1.9273518323898315, "learning_rate": 2.1494749379951906e-05, "loss": 0.5268, "step": 605000 }, { "epoch": 1.7117285207047064, "grad_norm": 1.885023832321167, "learning_rate": 2.147119132158823e-05, "loss": 0.5235, "step": 605500 }, { "epoch": 1.7131420042065268, "grad_norm": 1.8000057935714722, "learning_rate": 2.1447633263224553e-05, "loss": 0.5249, "step": 606000 }, { "epoch": 1.7145554877083473, "grad_norm": 2.0306034088134766, "learning_rate": 2.1424075204860877e-05, "loss": 0.527, "step": 606500 }, { "epoch": 1.715968971210168, "grad_norm": 1.7121691703796387, "learning_rate": 2.14005171464972e-05, "loss": 0.5241, "step": 607000 }, { "epoch": 1.7173824547119887, "grad_norm": 1.8435592651367188, "learning_rate": 2.1376959088133524e-05, "loss": 0.5237, "step": 607500 }, { "epoch": 1.7187959382138092, "grad_norm": 1.8696928024291992, "learning_rate": 2.1353401029769848e-05, "loss": 0.5284, "step": 608000 }, { "epoch": 1.7202094217156296, "grad_norm": 1.6849749088287354, "learning_rate": 2.132984297140617e-05, "loss": 0.5271, "step": 608500 }, { "epoch": 1.7216229052174503, "grad_norm": 1.8232489824295044, "learning_rate": 2.1306284913042495e-05, "loss": 0.5259, "step": 609000 }, { "epoch": 1.723036388719271, "grad_norm": 1.635512113571167, "learning_rate": 2.128272685467882e-05, "loss": 0.5225, "step": 609500 }, { "epoch": 1.7244498722210915, "grad_norm": 2.0718464851379395, "learning_rate": 2.1259168796315142e-05, "loss": 0.5207, "step": 610000 }, { "epoch": 1.725863355722912, "grad_norm": 1.6811671257019043, "learning_rate": 2.1235610737951466e-05, "loss": 0.5221, "step": 610500 }, { "epoch": 1.7272768392247326, "grad_norm": 1.7864670753479004, "learning_rate": 2.121205267958779e-05, "loss": 0.5204, "step": 611000 }, { "epoch": 1.728690322726553, "grad_norm": 1.9134594202041626, "learning_rate": 2.1188494621224113e-05, "loss": 0.5184, "step": 611500 }, { "epoch": 1.7301038062283736, "grad_norm": 1.936225414276123, "learning_rate": 2.116493656286044e-05, "loss": 0.5266, "step": 612000 }, { "epoch": 1.7315172897301943, "grad_norm": 1.7380188703536987, "learning_rate": 2.1141378504496764e-05, "loss": 0.5214, "step": 612500 }, { "epoch": 1.732930773232015, "grad_norm": 1.9957592487335205, "learning_rate": 2.1117820446133088e-05, "loss": 0.5282, "step": 613000 }, { "epoch": 1.7343442567338354, "grad_norm": 1.7405436038970947, "learning_rate": 2.109426238776941e-05, "loss": 0.5227, "step": 613500 }, { "epoch": 1.7357577402356559, "grad_norm": 1.835972547531128, "learning_rate": 2.1070704329405735e-05, "loss": 0.5234, "step": 614000 }, { "epoch": 1.7371712237374766, "grad_norm": 1.5600481033325195, "learning_rate": 2.104714627104206e-05, "loss": 0.5256, "step": 614500 }, { "epoch": 1.7385847072392973, "grad_norm": 1.6243133544921875, "learning_rate": 2.1023588212678383e-05, "loss": 0.5227, "step": 615000 }, { "epoch": 1.7399981907411177, "grad_norm": 1.869667410850525, "learning_rate": 2.1000030154314706e-05, "loss": 0.5243, "step": 615500 }, { "epoch": 1.7414116742429382, "grad_norm": 2.0219647884368896, "learning_rate": 2.097647209595103e-05, "loss": 0.5232, "step": 616000 }, { "epoch": 1.7428251577447589, "grad_norm": 1.8313305377960205, "learning_rate": 2.0952914037587354e-05, "loss": 0.5228, "step": 616500 }, { "epoch": 1.7442386412465793, "grad_norm": 1.9331393241882324, "learning_rate": 2.0929355979223677e-05, "loss": 0.5188, "step": 617000 }, { "epoch": 1.7456521247483998, "grad_norm": 1.7400931119918823, "learning_rate": 2.090579792086e-05, "loss": 0.519, "step": 617500 }, { "epoch": 1.7470656082502205, "grad_norm": 1.8199478387832642, "learning_rate": 2.0882239862496325e-05, "loss": 0.5228, "step": 618000 }, { "epoch": 1.7484790917520412, "grad_norm": 1.7065166234970093, "learning_rate": 2.0858681804132648e-05, "loss": 0.5211, "step": 618500 }, { "epoch": 1.7498925752538617, "grad_norm": 1.8836318254470825, "learning_rate": 2.0835123745768972e-05, "loss": 0.5212, "step": 619000 }, { "epoch": 1.7513060587556821, "grad_norm": 1.812211275100708, "learning_rate": 2.08115656874053e-05, "loss": 0.5279, "step": 619500 }, { "epoch": 1.7527195422575028, "grad_norm": 1.8233959674835205, "learning_rate": 2.0788007629041623e-05, "loss": 0.5211, "step": 620000 }, { "epoch": 1.7541330257593233, "grad_norm": 1.892747402191162, "learning_rate": 2.0764449570677946e-05, "loss": 0.5276, "step": 620500 }, { "epoch": 1.7555465092611438, "grad_norm": 1.8845887184143066, "learning_rate": 2.074089151231427e-05, "loss": 0.5207, "step": 621000 }, { "epoch": 1.7569599927629644, "grad_norm": 1.7125054597854614, "learning_rate": 2.0717333453950594e-05, "loss": 0.5236, "step": 621500 }, { "epoch": 1.7583734762647851, "grad_norm": 1.8812122344970703, "learning_rate": 2.0693775395586917e-05, "loss": 0.518, "step": 622000 }, { "epoch": 1.7597869597666056, "grad_norm": 1.8707724809646606, "learning_rate": 2.067021733722324e-05, "loss": 0.5205, "step": 622500 }, { "epoch": 1.761200443268426, "grad_norm": 1.7528687715530396, "learning_rate": 2.0646659278859565e-05, "loss": 0.5228, "step": 623000 }, { "epoch": 1.7626139267702468, "grad_norm": 1.8310612440109253, "learning_rate": 2.062310122049589e-05, "loss": 0.521, "step": 623500 }, { "epoch": 1.7640274102720674, "grad_norm": 1.8174140453338623, "learning_rate": 2.0599543162132212e-05, "loss": 0.517, "step": 624000 }, { "epoch": 1.765440893773888, "grad_norm": 1.9746261835098267, "learning_rate": 2.0575985103768536e-05, "loss": 0.5226, "step": 624500 }, { "epoch": 1.7668543772757084, "grad_norm": 1.7225979566574097, "learning_rate": 2.055242704540486e-05, "loss": 0.5197, "step": 625000 }, { "epoch": 1.768267860777529, "grad_norm": 1.7863434553146362, "learning_rate": 2.0528868987041183e-05, "loss": 0.5235, "step": 625500 }, { "epoch": 1.7696813442793495, "grad_norm": 1.8614521026611328, "learning_rate": 2.0505310928677507e-05, "loss": 0.523, "step": 626000 }, { "epoch": 1.77109482778117, "grad_norm": 2.036212205886841, "learning_rate": 2.0481752870313834e-05, "loss": 0.5198, "step": 626500 }, { "epoch": 1.7725083112829907, "grad_norm": 1.9951646327972412, "learning_rate": 2.0458194811950157e-05, "loss": 0.5196, "step": 627000 }, { "epoch": 1.7739217947848114, "grad_norm": 1.915747880935669, "learning_rate": 2.043463675358648e-05, "loss": 0.5217, "step": 627500 }, { "epoch": 1.7753352782866318, "grad_norm": 1.839570164680481, "learning_rate": 2.0411078695222805e-05, "loss": 0.5191, "step": 628000 }, { "epoch": 1.7767487617884523, "grad_norm": 1.9617575407028198, "learning_rate": 2.038752063685913e-05, "loss": 0.5178, "step": 628500 }, { "epoch": 1.778162245290273, "grad_norm": 1.839137077331543, "learning_rate": 2.0363962578495452e-05, "loss": 0.5175, "step": 629000 }, { "epoch": 1.7795757287920937, "grad_norm": 2.078562021255493, "learning_rate": 2.0340404520131776e-05, "loss": 0.5216, "step": 629500 }, { "epoch": 1.7809892122939142, "grad_norm": 1.9178249835968018, "learning_rate": 2.03168464617681e-05, "loss": 0.5226, "step": 630000 }, { "epoch": 1.7824026957957346, "grad_norm": 1.8646669387817383, "learning_rate": 2.0293288403404423e-05, "loss": 0.5209, "step": 630500 }, { "epoch": 1.7838161792975553, "grad_norm": 1.9033335447311401, "learning_rate": 2.0269730345040747e-05, "loss": 0.5226, "step": 631000 }, { "epoch": 1.7852296627993758, "grad_norm": 1.6369844675064087, "learning_rate": 2.024617228667707e-05, "loss": 0.5206, "step": 631500 }, { "epoch": 1.7866431463011962, "grad_norm": 1.9481797218322754, "learning_rate": 2.0222614228313394e-05, "loss": 0.5208, "step": 632000 }, { "epoch": 1.788056629803017, "grad_norm": 1.776780605316162, "learning_rate": 2.0199056169949718e-05, "loss": 0.5197, "step": 632500 }, { "epoch": 1.7894701133048376, "grad_norm": 1.7492467164993286, "learning_rate": 2.017549811158604e-05, "loss": 0.516, "step": 633000 }, { "epoch": 1.790883596806658, "grad_norm": 1.7395992279052734, "learning_rate": 2.0151940053222365e-05, "loss": 0.5144, "step": 633500 }, { "epoch": 1.7922970803084786, "grad_norm": 1.8977041244506836, "learning_rate": 2.0128381994858692e-05, "loss": 0.5168, "step": 634000 }, { "epoch": 1.7937105638102993, "grad_norm": 1.7062913179397583, "learning_rate": 2.0104823936495016e-05, "loss": 0.5212, "step": 634500 }, { "epoch": 1.7951240473121197, "grad_norm": 1.6778467893600464, "learning_rate": 2.008126587813134e-05, "loss": 0.519, "step": 635000 }, { "epoch": 1.7965375308139402, "grad_norm": 1.759216547012329, "learning_rate": 2.0057707819767663e-05, "loss": 0.5228, "step": 635500 }, { "epoch": 1.7979510143157609, "grad_norm": 2.0823416709899902, "learning_rate": 2.0034149761403987e-05, "loss": 0.5182, "step": 636000 }, { "epoch": 1.7993644978175816, "grad_norm": 1.8298286199569702, "learning_rate": 2.001059170304031e-05, "loss": 0.5176, "step": 636500 }, { "epoch": 1.800777981319402, "grad_norm": 1.9732640981674194, "learning_rate": 1.9987033644676634e-05, "loss": 0.5135, "step": 637000 }, { "epoch": 1.8021914648212225, "grad_norm": 1.9541376829147339, "learning_rate": 1.9963475586312958e-05, "loss": 0.5205, "step": 637500 }, { "epoch": 1.8036049483230432, "grad_norm": 1.91489577293396, "learning_rate": 1.9939917527949282e-05, "loss": 0.5179, "step": 638000 }, { "epoch": 1.8050184318248639, "grad_norm": 1.806699514389038, "learning_rate": 1.9916359469585605e-05, "loss": 0.5185, "step": 638500 }, { "epoch": 1.8064319153266843, "grad_norm": 1.6850334405899048, "learning_rate": 1.989280141122193e-05, "loss": 0.5167, "step": 639000 }, { "epoch": 1.8078453988285048, "grad_norm": 1.973204493522644, "learning_rate": 1.9869243352858253e-05, "loss": 0.5196, "step": 639500 }, { "epoch": 1.8092588823303255, "grad_norm": 1.8167065382003784, "learning_rate": 1.9845685294494576e-05, "loss": 0.5187, "step": 640000 }, { "epoch": 1.810672365832146, "grad_norm": 1.8875184059143066, "learning_rate": 1.98221272361309e-05, "loss": 0.5186, "step": 640500 }, { "epoch": 1.8120858493339664, "grad_norm": 2.038022994995117, "learning_rate": 1.9798569177767227e-05, "loss": 0.5145, "step": 641000 }, { "epoch": 1.8134993328357871, "grad_norm": 2.170346736907959, "learning_rate": 1.977501111940355e-05, "loss": 0.5125, "step": 641500 }, { "epoch": 1.8149128163376078, "grad_norm": 1.7908625602722168, "learning_rate": 1.9751453061039874e-05, "loss": 0.5214, "step": 642000 }, { "epoch": 1.8163262998394283, "grad_norm": 1.912365436553955, "learning_rate": 1.9727895002676198e-05, "loss": 0.5201, "step": 642500 }, { "epoch": 1.8177397833412487, "grad_norm": 2.141680955886841, "learning_rate": 1.9704336944312522e-05, "loss": 0.5199, "step": 643000 }, { "epoch": 1.8191532668430694, "grad_norm": 1.8249636888504028, "learning_rate": 1.9680778885948846e-05, "loss": 0.516, "step": 643500 }, { "epoch": 1.8205667503448901, "grad_norm": 1.9040132761001587, "learning_rate": 1.965722082758517e-05, "loss": 0.5171, "step": 644000 }, { "epoch": 1.8219802338467106, "grad_norm": 1.9884229898452759, "learning_rate": 1.9633662769221493e-05, "loss": 0.515, "step": 644500 }, { "epoch": 1.823393717348531, "grad_norm": 1.8702929019927979, "learning_rate": 1.9610104710857817e-05, "loss": 0.5206, "step": 645000 }, { "epoch": 1.8248072008503518, "grad_norm": 1.8179206848144531, "learning_rate": 1.958654665249414e-05, "loss": 0.5148, "step": 645500 }, { "epoch": 1.8262206843521722, "grad_norm": 1.7919546365737915, "learning_rate": 1.9562988594130464e-05, "loss": 0.5118, "step": 646000 }, { "epoch": 1.8276341678539927, "grad_norm": 1.8979545831680298, "learning_rate": 1.9539430535766788e-05, "loss": 0.5172, "step": 646500 }, { "epoch": 1.8290476513558134, "grad_norm": 1.8696070909500122, "learning_rate": 1.951587247740311e-05, "loss": 0.5171, "step": 647000 }, { "epoch": 1.830461134857634, "grad_norm": 1.7216817140579224, "learning_rate": 1.9492314419039435e-05, "loss": 0.5171, "step": 647500 }, { "epoch": 1.8318746183594545, "grad_norm": 1.9875612258911133, "learning_rate": 1.946875636067576e-05, "loss": 0.5173, "step": 648000 }, { "epoch": 1.833288101861275, "grad_norm": 1.926230549812317, "learning_rate": 1.9445198302312086e-05, "loss": 0.5155, "step": 648500 }, { "epoch": 1.8347015853630957, "grad_norm": 2.0745272636413574, "learning_rate": 1.942164024394841e-05, "loss": 0.5199, "step": 649000 }, { "epoch": 1.8361150688649162, "grad_norm": 1.9695639610290527, "learning_rate": 1.9398082185584733e-05, "loss": 0.5178, "step": 649500 }, { "epoch": 1.8375285523667366, "grad_norm": 1.7092125415802002, "learning_rate": 1.9374524127221057e-05, "loss": 0.5172, "step": 650000 }, { "epoch": 1.8389420358685573, "grad_norm": 1.9210418462753296, "learning_rate": 1.935096606885738e-05, "loss": 0.5146, "step": 650500 }, { "epoch": 1.840355519370378, "grad_norm": 1.956869125366211, "learning_rate": 1.9327408010493704e-05, "loss": 0.5182, "step": 651000 }, { "epoch": 1.8417690028721985, "grad_norm": 1.8627721071243286, "learning_rate": 1.9303849952130028e-05, "loss": 0.5138, "step": 651500 }, { "epoch": 1.843182486374019, "grad_norm": 1.830898642539978, "learning_rate": 1.928029189376635e-05, "loss": 0.5121, "step": 652000 }, { "epoch": 1.8445959698758396, "grad_norm": 1.890161156654358, "learning_rate": 1.9256733835402675e-05, "loss": 0.5166, "step": 652500 }, { "epoch": 1.8460094533776603, "grad_norm": 1.8892295360565186, "learning_rate": 1.9233175777039e-05, "loss": 0.5157, "step": 653000 }, { "epoch": 1.8474229368794808, "grad_norm": 1.9445174932479858, "learning_rate": 1.9209617718675322e-05, "loss": 0.517, "step": 653500 }, { "epoch": 1.8488364203813012, "grad_norm": 2.261033535003662, "learning_rate": 1.9186059660311646e-05, "loss": 0.5192, "step": 654000 }, { "epoch": 1.850249903883122, "grad_norm": 1.9059357643127441, "learning_rate": 1.916250160194797e-05, "loss": 0.5147, "step": 654500 }, { "epoch": 1.8516633873849424, "grad_norm": 1.7923301458358765, "learning_rate": 1.9138943543584293e-05, "loss": 0.5155, "step": 655000 }, { "epoch": 1.8530768708867629, "grad_norm": 1.8926290273666382, "learning_rate": 1.911538548522062e-05, "loss": 0.5165, "step": 655500 }, { "epoch": 1.8544903543885836, "grad_norm": 1.9066174030303955, "learning_rate": 1.9091827426856944e-05, "loss": 0.5151, "step": 656000 }, { "epoch": 1.8559038378904043, "grad_norm": 1.9111627340316772, "learning_rate": 1.9068269368493268e-05, "loss": 0.519, "step": 656500 }, { "epoch": 1.8573173213922247, "grad_norm": 1.9365427494049072, "learning_rate": 1.904471131012959e-05, "loss": 0.5134, "step": 657000 }, { "epoch": 1.8587308048940452, "grad_norm": 1.8084731101989746, "learning_rate": 1.9021153251765915e-05, "loss": 0.515, "step": 657500 }, { "epoch": 1.8601442883958659, "grad_norm": 1.9015299081802368, "learning_rate": 1.899759519340224e-05, "loss": 0.5155, "step": 658000 }, { "epoch": 1.8615577718976866, "grad_norm": 1.7965574264526367, "learning_rate": 1.8974037135038563e-05, "loss": 0.5101, "step": 658500 }, { "epoch": 1.862971255399507, "grad_norm": 1.9307502508163452, "learning_rate": 1.8950479076674886e-05, "loss": 0.5186, "step": 659000 }, { "epoch": 1.8643847389013275, "grad_norm": 1.8215925693511963, "learning_rate": 1.892692101831121e-05, "loss": 0.5193, "step": 659500 }, { "epoch": 1.8657982224031482, "grad_norm": 1.8499749898910522, "learning_rate": 1.8903362959947534e-05, "loss": 0.5149, "step": 660000 }, { "epoch": 1.8672117059049687, "grad_norm": 2.0501291751861572, "learning_rate": 1.8879804901583857e-05, "loss": 0.5137, "step": 660500 }, { "epoch": 1.8686251894067891, "grad_norm": 1.7520736455917358, "learning_rate": 1.885624684322018e-05, "loss": 0.5191, "step": 661000 }, { "epoch": 1.8700386729086098, "grad_norm": 1.8023104667663574, "learning_rate": 1.8832688784856505e-05, "loss": 0.5149, "step": 661500 }, { "epoch": 1.8714521564104305, "grad_norm": 1.9904227256774902, "learning_rate": 1.8809130726492828e-05, "loss": 0.5166, "step": 662000 }, { "epoch": 1.872865639912251, "grad_norm": 1.8475818634033203, "learning_rate": 1.8785572668129152e-05, "loss": 0.5181, "step": 662500 }, { "epoch": 1.8742791234140714, "grad_norm": 1.9907537698745728, "learning_rate": 1.8762014609765476e-05, "loss": 0.5202, "step": 663000 }, { "epoch": 1.8756926069158921, "grad_norm": 2.1697678565979004, "learning_rate": 1.87384565514018e-05, "loss": 0.515, "step": 663500 }, { "epoch": 1.8771060904177126, "grad_norm": 1.8073922395706177, "learning_rate": 1.8714898493038123e-05, "loss": 0.5133, "step": 664000 }, { "epoch": 1.878519573919533, "grad_norm": 1.9101812839508057, "learning_rate": 1.8691340434674447e-05, "loss": 0.5173, "step": 664500 }, { "epoch": 1.8799330574213537, "grad_norm": 1.9438273906707764, "learning_rate": 1.866778237631077e-05, "loss": 0.517, "step": 665000 }, { "epoch": 1.8813465409231744, "grad_norm": 1.8876426219940186, "learning_rate": 1.8644224317947094e-05, "loss": 0.5161, "step": 665500 }, { "epoch": 1.882760024424995, "grad_norm": 1.8120912313461304, "learning_rate": 1.8620666259583418e-05, "loss": 0.5141, "step": 666000 }, { "epoch": 1.8841735079268154, "grad_norm": 1.806697130203247, "learning_rate": 1.859710820121974e-05, "loss": 0.5123, "step": 666500 }, { "epoch": 1.885586991428636, "grad_norm": 2.0013318061828613, "learning_rate": 1.8573550142856065e-05, "loss": 0.5183, "step": 667000 }, { "epoch": 1.8870004749304568, "grad_norm": 1.8708502054214478, "learning_rate": 1.854999208449239e-05, "loss": 0.5137, "step": 667500 }, { "epoch": 1.8884139584322772, "grad_norm": 1.7306561470031738, "learning_rate": 1.8526434026128712e-05, "loss": 0.5197, "step": 668000 }, { "epoch": 1.8898274419340977, "grad_norm": 1.8186429738998413, "learning_rate": 1.8502875967765036e-05, "loss": 0.5171, "step": 668500 }, { "epoch": 1.8912409254359184, "grad_norm": 1.8592827320098877, "learning_rate": 1.847931790940136e-05, "loss": 0.5091, "step": 669000 }, { "epoch": 1.8926544089377388, "grad_norm": 1.9580775499343872, "learning_rate": 1.8455759851037683e-05, "loss": 0.5124, "step": 669500 }, { "epoch": 1.8940678924395593, "grad_norm": 2.018160820007324, "learning_rate": 1.843220179267401e-05, "loss": 0.5109, "step": 670000 }, { "epoch": 1.89548137594138, "grad_norm": 1.8487341403961182, "learning_rate": 1.8408643734310334e-05, "loss": 0.5103, "step": 670500 }, { "epoch": 1.8968948594432007, "grad_norm": 1.9873692989349365, "learning_rate": 1.8385085675946658e-05, "loss": 0.5164, "step": 671000 }, { "epoch": 1.8983083429450212, "grad_norm": 1.7633508443832397, "learning_rate": 1.836152761758298e-05, "loss": 0.5141, "step": 671500 }, { "epoch": 1.8997218264468416, "grad_norm": 1.900223970413208, "learning_rate": 1.8337969559219305e-05, "loss": 0.5112, "step": 672000 }, { "epoch": 1.9011353099486623, "grad_norm": 1.8788912296295166, "learning_rate": 1.831441150085563e-05, "loss": 0.5106, "step": 672500 }, { "epoch": 1.902548793450483, "grad_norm": 1.8103070259094238, "learning_rate": 1.8290853442491952e-05, "loss": 0.5122, "step": 673000 }, { "epoch": 1.9039622769523035, "grad_norm": 1.696487545967102, "learning_rate": 1.8267295384128276e-05, "loss": 0.5185, "step": 673500 }, { "epoch": 1.905375760454124, "grad_norm": 1.866716742515564, "learning_rate": 1.82437373257646e-05, "loss": 0.5122, "step": 674000 }, { "epoch": 1.9067892439559446, "grad_norm": 1.7520943880081177, "learning_rate": 1.8220179267400923e-05, "loss": 0.5161, "step": 674500 }, { "epoch": 1.908202727457765, "grad_norm": 1.7234801054000854, "learning_rate": 1.8196621209037247e-05, "loss": 0.5105, "step": 675000 }, { "epoch": 1.9096162109595856, "grad_norm": 1.7833620309829712, "learning_rate": 1.817306315067357e-05, "loss": 0.512, "step": 675500 }, { "epoch": 1.9110296944614062, "grad_norm": 2.0176827907562256, "learning_rate": 1.8149505092309895e-05, "loss": 0.5126, "step": 676000 }, { "epoch": 1.912443177963227, "grad_norm": 1.8365172147750854, "learning_rate": 1.8125947033946218e-05, "loss": 0.5111, "step": 676500 }, { "epoch": 1.9138566614650474, "grad_norm": 1.7630218267440796, "learning_rate": 1.8102388975582542e-05, "loss": 0.5119, "step": 677000 }, { "epoch": 1.9152701449668679, "grad_norm": 1.9338140487670898, "learning_rate": 1.807883091721887e-05, "loss": 0.514, "step": 677500 }, { "epoch": 1.9166836284686886, "grad_norm": 1.9412599802017212, "learning_rate": 1.8055272858855193e-05, "loss": 0.5125, "step": 678000 }, { "epoch": 1.918097111970509, "grad_norm": 1.9693238735198975, "learning_rate": 1.8031714800491516e-05, "loss": 0.5105, "step": 678500 }, { "epoch": 1.9195105954723295, "grad_norm": 1.8031787872314453, "learning_rate": 1.800815674212784e-05, "loss": 0.5139, "step": 679000 }, { "epoch": 1.9209240789741502, "grad_norm": 2.140437126159668, "learning_rate": 1.7984598683764164e-05, "loss": 0.5127, "step": 679500 }, { "epoch": 1.9223375624759709, "grad_norm": 1.8654301166534424, "learning_rate": 1.7961040625400487e-05, "loss": 0.5123, "step": 680000 }, { "epoch": 1.9237510459777913, "grad_norm": 1.9253193140029907, "learning_rate": 1.793748256703681e-05, "loss": 0.5133, "step": 680500 }, { "epoch": 1.9251645294796118, "grad_norm": 1.7545816898345947, "learning_rate": 1.7913924508673135e-05, "loss": 0.5128, "step": 681000 }, { "epoch": 1.9265780129814325, "grad_norm": 1.7978060245513916, "learning_rate": 1.789036645030946e-05, "loss": 0.51, "step": 681500 }, { "epoch": 1.9279914964832532, "grad_norm": 2.004182815551758, "learning_rate": 1.7866808391945782e-05, "loss": 0.5154, "step": 682000 }, { "epoch": 1.9294049799850737, "grad_norm": 1.9322947263717651, "learning_rate": 1.7843250333582106e-05, "loss": 0.5117, "step": 682500 }, { "epoch": 1.9308184634868941, "grad_norm": 2.036376714706421, "learning_rate": 1.781969227521843e-05, "loss": 0.5092, "step": 683000 }, { "epoch": 1.9322319469887148, "grad_norm": 1.8838669061660767, "learning_rate": 1.7796134216854753e-05, "loss": 0.5119, "step": 683500 }, { "epoch": 1.9336454304905353, "grad_norm": 2.156785488128662, "learning_rate": 1.7772576158491077e-05, "loss": 0.5111, "step": 684000 }, { "epoch": 1.9350589139923557, "grad_norm": 1.8784955739974976, "learning_rate": 1.7749018100127404e-05, "loss": 0.5182, "step": 684500 }, { "epoch": 1.9364723974941764, "grad_norm": 1.896589756011963, "learning_rate": 1.7725460041763727e-05, "loss": 0.5131, "step": 685000 }, { "epoch": 1.9378858809959971, "grad_norm": 1.699276328086853, "learning_rate": 1.770190198340005e-05, "loss": 0.5099, "step": 685500 }, { "epoch": 1.9392993644978176, "grad_norm": 1.9924253225326538, "learning_rate": 1.7678343925036375e-05, "loss": 0.5125, "step": 686000 }, { "epoch": 1.940712847999638, "grad_norm": 1.8573499917984009, "learning_rate": 1.76547858666727e-05, "loss": 0.5105, "step": 686500 }, { "epoch": 1.9421263315014587, "grad_norm": 1.9433872699737549, "learning_rate": 1.7631227808309022e-05, "loss": 0.5078, "step": 687000 }, { "epoch": 1.9435398150032794, "grad_norm": 1.9593645334243774, "learning_rate": 1.7607669749945346e-05, "loss": 0.5117, "step": 687500 }, { "epoch": 1.9449532985051, "grad_norm": 1.8675421476364136, "learning_rate": 1.758411169158167e-05, "loss": 0.5101, "step": 688000 }, { "epoch": 1.9463667820069204, "grad_norm": 1.9145921468734741, "learning_rate": 1.7560553633217993e-05, "loss": 0.5077, "step": 688500 }, { "epoch": 1.947780265508741, "grad_norm": 1.7996569871902466, "learning_rate": 1.7536995574854317e-05, "loss": 0.5143, "step": 689000 }, { "epoch": 1.9491937490105615, "grad_norm": 1.896638035774231, "learning_rate": 1.751343751649064e-05, "loss": 0.5118, "step": 689500 }, { "epoch": 1.950607232512382, "grad_norm": 1.8960877656936646, "learning_rate": 1.7489879458126964e-05, "loss": 0.5129, "step": 690000 }, { "epoch": 1.9520207160142027, "grad_norm": 1.9848734140396118, "learning_rate": 1.7466321399763288e-05, "loss": 0.513, "step": 690500 }, { "epoch": 1.9534341995160234, "grad_norm": 1.9179186820983887, "learning_rate": 1.744276334139961e-05, "loss": 0.5047, "step": 691000 }, { "epoch": 1.9548476830178438, "grad_norm": 1.8694814443588257, "learning_rate": 1.7419205283035935e-05, "loss": 0.5099, "step": 691500 }, { "epoch": 1.9562611665196643, "grad_norm": 1.8542132377624512, "learning_rate": 1.7395647224672262e-05, "loss": 0.5129, "step": 692000 }, { "epoch": 1.957674650021485, "grad_norm": 2.1884853839874268, "learning_rate": 1.7372089166308586e-05, "loss": 0.5121, "step": 692500 }, { "epoch": 1.9590881335233055, "grad_norm": 1.8633896112442017, "learning_rate": 1.734853110794491e-05, "loss": 0.5107, "step": 693000 }, { "epoch": 1.960501617025126, "grad_norm": 1.8736402988433838, "learning_rate": 1.7324973049581233e-05, "loss": 0.5102, "step": 693500 }, { "epoch": 1.9619151005269466, "grad_norm": 1.8055630922317505, "learning_rate": 1.7301414991217557e-05, "loss": 0.5154, "step": 694000 }, { "epoch": 1.9633285840287673, "grad_norm": 1.6645179986953735, "learning_rate": 1.727785693285388e-05, "loss": 0.5088, "step": 694500 }, { "epoch": 1.9647420675305878, "grad_norm": 1.725305438041687, "learning_rate": 1.7254298874490204e-05, "loss": 0.5037, "step": 695000 }, { "epoch": 1.9661555510324082, "grad_norm": 2.0037930011749268, "learning_rate": 1.7230740816126528e-05, "loss": 0.5099, "step": 695500 }, { "epoch": 1.967569034534229, "grad_norm": 2.0087485313415527, "learning_rate": 1.720718275776285e-05, "loss": 0.5086, "step": 696000 }, { "epoch": 1.9689825180360496, "grad_norm": 1.8879730701446533, "learning_rate": 1.7183624699399175e-05, "loss": 0.5076, "step": 696500 }, { "epoch": 1.97039600153787, "grad_norm": 1.945778489112854, "learning_rate": 1.71600666410355e-05, "loss": 0.509, "step": 697000 }, { "epoch": 1.9718094850396906, "grad_norm": 1.895570993423462, "learning_rate": 1.7136508582671823e-05, "loss": 0.5101, "step": 697500 }, { "epoch": 1.9732229685415112, "grad_norm": 1.845316767692566, "learning_rate": 1.7112950524308146e-05, "loss": 0.5088, "step": 698000 }, { "epoch": 1.9746364520433317, "grad_norm": 1.8677914142608643, "learning_rate": 1.708939246594447e-05, "loss": 0.5102, "step": 698500 }, { "epoch": 1.9760499355451522, "grad_norm": 2.156189441680908, "learning_rate": 1.7065834407580797e-05, "loss": 0.5025, "step": 699000 }, { "epoch": 1.9774634190469729, "grad_norm": 1.8156108856201172, "learning_rate": 1.704227634921712e-05, "loss": 0.5046, "step": 699500 }, { "epoch": 1.9788769025487936, "grad_norm": 1.8314679861068726, "learning_rate": 1.7018718290853444e-05, "loss": 0.5129, "step": 700000 }, { "epoch": 1.980290386050614, "grad_norm": 2.0571937561035156, "learning_rate": 1.6995160232489768e-05, "loss": 0.5109, "step": 700500 }, { "epoch": 1.9817038695524345, "grad_norm": 2.2031638622283936, "learning_rate": 1.6971602174126092e-05, "loss": 0.5142, "step": 701000 }, { "epoch": 1.9831173530542552, "grad_norm": 2.1552109718322754, "learning_rate": 1.6948044115762415e-05, "loss": 0.5048, "step": 701500 }, { "epoch": 1.9845308365560759, "grad_norm": 1.8365544080734253, "learning_rate": 1.692448605739874e-05, "loss": 0.5095, "step": 702000 }, { "epoch": 1.9859443200578963, "grad_norm": 1.9672375917434692, "learning_rate": 1.6900927999035063e-05, "loss": 0.5074, "step": 702500 }, { "epoch": 1.9873578035597168, "grad_norm": 1.7463129758834839, "learning_rate": 1.6877369940671386e-05, "loss": 0.5115, "step": 703000 }, { "epoch": 1.9887712870615375, "grad_norm": 2.02946138381958, "learning_rate": 1.685381188230771e-05, "loss": 0.511, "step": 703500 }, { "epoch": 1.990184770563358, "grad_norm": 1.9783172607421875, "learning_rate": 1.6830253823944034e-05, "loss": 0.5091, "step": 704000 }, { "epoch": 1.9915982540651784, "grad_norm": 1.9631342887878418, "learning_rate": 1.6806695765580357e-05, "loss": 0.5111, "step": 704500 }, { "epoch": 1.9930117375669991, "grad_norm": 2.007530689239502, "learning_rate": 1.678313770721668e-05, "loss": 0.5112, "step": 705000 }, { "epoch": 1.9944252210688198, "grad_norm": 2.036952257156372, "learning_rate": 1.6759579648853005e-05, "loss": 0.5075, "step": 705500 }, { "epoch": 1.9958387045706403, "grad_norm": 1.8756660223007202, "learning_rate": 1.673602159048933e-05, "loss": 0.5099, "step": 706000 }, { "epoch": 1.9972521880724607, "grad_norm": 1.9247711896896362, "learning_rate": 1.6712463532125656e-05, "loss": 0.5056, "step": 706500 }, { "epoch": 1.9986656715742814, "grad_norm": 1.8311184644699097, "learning_rate": 1.668890547376198e-05, "loss": 0.507, "step": 707000 }, { "epoch": 2.000079155076102, "grad_norm": 1.7567567825317383, "learning_rate": 1.6665347415398303e-05, "loss": 0.5086, "step": 707500 }, { "epoch": 2.0014926385779224, "grad_norm": 2.1177659034729004, "learning_rate": 1.6641789357034627e-05, "loss": 0.5086, "step": 708000 }, { "epoch": 2.002906122079743, "grad_norm": 2.1270751953125, "learning_rate": 1.661823129867095e-05, "loss": 0.5096, "step": 708500 }, { "epoch": 2.0043196055815637, "grad_norm": 2.0122597217559814, "learning_rate": 1.6594673240307274e-05, "loss": 0.5079, "step": 709000 }, { "epoch": 2.0057330890833844, "grad_norm": 1.8594169616699219, "learning_rate": 1.6571115181943598e-05, "loss": 0.5077, "step": 709500 }, { "epoch": 2.0071465725852047, "grad_norm": 1.9252173900604248, "learning_rate": 1.654755712357992e-05, "loss": 0.5061, "step": 710000 }, { "epoch": 2.0085600560870254, "grad_norm": 1.9239895343780518, "learning_rate": 1.6523999065216245e-05, "loss": 0.5105, "step": 710500 }, { "epoch": 2.009973539588846, "grad_norm": 1.8859412670135498, "learning_rate": 1.650044100685257e-05, "loss": 0.5109, "step": 711000 }, { "epoch": 2.0113870230906663, "grad_norm": 1.7316310405731201, "learning_rate": 1.6476882948488892e-05, "loss": 0.5051, "step": 711500 }, { "epoch": 2.012800506592487, "grad_norm": 1.7745323181152344, "learning_rate": 1.6453324890125216e-05, "loss": 0.5086, "step": 712000 }, { "epoch": 2.0142139900943077, "grad_norm": 1.7576372623443604, "learning_rate": 1.642976683176154e-05, "loss": 0.5089, "step": 712500 }, { "epoch": 2.0156274735961284, "grad_norm": 2.2024598121643066, "learning_rate": 1.6406208773397863e-05, "loss": 0.5069, "step": 713000 }, { "epoch": 2.0170409570979486, "grad_norm": 1.9023475646972656, "learning_rate": 1.638265071503419e-05, "loss": 0.5071, "step": 713500 }, { "epoch": 2.0184544405997693, "grad_norm": 2.133826494216919, "learning_rate": 1.6359092656670514e-05, "loss": 0.5054, "step": 714000 }, { "epoch": 2.01986792410159, "grad_norm": 1.9100439548492432, "learning_rate": 1.6335534598306838e-05, "loss": 0.505, "step": 714500 }, { "epoch": 2.0212814076034102, "grad_norm": 1.7768245935440063, "learning_rate": 1.631197653994316e-05, "loss": 0.5041, "step": 715000 }, { "epoch": 2.022694891105231, "grad_norm": 1.8135194778442383, "learning_rate": 1.6288418481579485e-05, "loss": 0.5039, "step": 715500 }, { "epoch": 2.0241083746070516, "grad_norm": 1.8838341236114502, "learning_rate": 1.626486042321581e-05, "loss": 0.5046, "step": 716000 }, { "epoch": 2.0255218581088723, "grad_norm": 1.8504492044448853, "learning_rate": 1.6241302364852132e-05, "loss": 0.5044, "step": 716500 }, { "epoch": 2.0269353416106926, "grad_norm": 1.930123209953308, "learning_rate": 1.6217744306488456e-05, "loss": 0.5073, "step": 717000 }, { "epoch": 2.0283488251125132, "grad_norm": 2.0734145641326904, "learning_rate": 1.619418624812478e-05, "loss": 0.5069, "step": 717500 }, { "epoch": 2.029762308614334, "grad_norm": 1.740331768989563, "learning_rate": 1.6170628189761103e-05, "loss": 0.5081, "step": 718000 }, { "epoch": 2.0311757921161546, "grad_norm": 1.8909633159637451, "learning_rate": 1.6147070131397427e-05, "loss": 0.4987, "step": 718500 }, { "epoch": 2.032589275617975, "grad_norm": 1.8603860139846802, "learning_rate": 1.612351207303375e-05, "loss": 0.5104, "step": 719000 }, { "epoch": 2.0340027591197956, "grad_norm": 1.8285483121871948, "learning_rate": 1.6099954014670074e-05, "loss": 0.5067, "step": 719500 }, { "epoch": 2.0354162426216162, "grad_norm": 1.8741170167922974, "learning_rate": 1.6076395956306398e-05, "loss": 0.506, "step": 720000 }, { "epoch": 2.0368297261234365, "grad_norm": 1.9967536926269531, "learning_rate": 1.6052837897942722e-05, "loss": 0.5058, "step": 720500 }, { "epoch": 2.038243209625257, "grad_norm": 1.9384851455688477, "learning_rate": 1.602927983957905e-05, "loss": 0.5093, "step": 721000 }, { "epoch": 2.039656693127078, "grad_norm": 1.8738670349121094, "learning_rate": 1.6005721781215373e-05, "loss": 0.5023, "step": 721500 }, { "epoch": 2.0410701766288986, "grad_norm": 1.8871054649353027, "learning_rate": 1.5982163722851696e-05, "loss": 0.5098, "step": 722000 }, { "epoch": 2.042483660130719, "grad_norm": 1.918533444404602, "learning_rate": 1.595860566448802e-05, "loss": 0.4991, "step": 722500 }, { "epoch": 2.0438971436325395, "grad_norm": 1.7173200845718384, "learning_rate": 1.5935047606124344e-05, "loss": 0.5048, "step": 723000 }, { "epoch": 2.04531062713436, "grad_norm": 1.810490608215332, "learning_rate": 1.5911489547760667e-05, "loss": 0.5074, "step": 723500 }, { "epoch": 2.046724110636181, "grad_norm": 1.9068644046783447, "learning_rate": 1.588793148939699e-05, "loss": 0.5041, "step": 724000 }, { "epoch": 2.048137594138001, "grad_norm": 1.8920402526855469, "learning_rate": 1.5864373431033315e-05, "loss": 0.5056, "step": 724500 }, { "epoch": 2.049551077639822, "grad_norm": 1.819712519645691, "learning_rate": 1.5840815372669638e-05, "loss": 0.5028, "step": 725000 }, { "epoch": 2.0509645611416425, "grad_norm": 1.938735842704773, "learning_rate": 1.5817257314305962e-05, "loss": 0.5026, "step": 725500 }, { "epoch": 2.0523780446434627, "grad_norm": 1.9647095203399658, "learning_rate": 1.5793699255942286e-05, "loss": 0.5002, "step": 726000 }, { "epoch": 2.0537915281452834, "grad_norm": 1.7423869371414185, "learning_rate": 1.577014119757861e-05, "loss": 0.5041, "step": 726500 }, { "epoch": 2.055205011647104, "grad_norm": 1.6321732997894287, "learning_rate": 1.5746583139214933e-05, "loss": 0.5046, "step": 727000 }, { "epoch": 2.056618495148925, "grad_norm": 1.8203030824661255, "learning_rate": 1.5723025080851257e-05, "loss": 0.5033, "step": 727500 }, { "epoch": 2.058031978650745, "grad_norm": 1.9798448085784912, "learning_rate": 1.5699467022487584e-05, "loss": 0.5022, "step": 728000 }, { "epoch": 2.0594454621525657, "grad_norm": 1.8696880340576172, "learning_rate": 1.5675908964123907e-05, "loss": 0.5058, "step": 728500 }, { "epoch": 2.0608589456543864, "grad_norm": 2.044553518295288, "learning_rate": 1.565235090576023e-05, "loss": 0.509, "step": 729000 }, { "epoch": 2.0622724291562067, "grad_norm": 1.8484811782836914, "learning_rate": 1.5628792847396555e-05, "loss": 0.5058, "step": 729500 }, { "epoch": 2.0636859126580274, "grad_norm": 1.8480814695358276, "learning_rate": 1.560523478903288e-05, "loss": 0.5071, "step": 730000 }, { "epoch": 2.065099396159848, "grad_norm": 1.9192343950271606, "learning_rate": 1.5581676730669202e-05, "loss": 0.5052, "step": 730500 }, { "epoch": 2.0665128796616687, "grad_norm": 1.9837762117385864, "learning_rate": 1.5558118672305526e-05, "loss": 0.5053, "step": 731000 }, { "epoch": 2.067926363163489, "grad_norm": 1.9514172077178955, "learning_rate": 1.553456061394185e-05, "loss": 0.507, "step": 731500 }, { "epoch": 2.0693398466653097, "grad_norm": 1.8313169479370117, "learning_rate": 1.5511002555578173e-05, "loss": 0.5042, "step": 732000 }, { "epoch": 2.0707533301671304, "grad_norm": 1.8846704959869385, "learning_rate": 1.5487444497214497e-05, "loss": 0.5077, "step": 732500 }, { "epoch": 2.072166813668951, "grad_norm": 1.8080190420150757, "learning_rate": 1.546388643885082e-05, "loss": 0.5073, "step": 733000 }, { "epoch": 2.0735802971707713, "grad_norm": 1.886803150177002, "learning_rate": 1.5440328380487144e-05, "loss": 0.5046, "step": 733500 }, { "epoch": 2.074993780672592, "grad_norm": 1.9107117652893066, "learning_rate": 1.5416770322123468e-05, "loss": 0.5068, "step": 734000 }, { "epoch": 2.0764072641744127, "grad_norm": 1.879349708557129, "learning_rate": 1.539321226375979e-05, "loss": 0.5026, "step": 734500 }, { "epoch": 2.077820747676233, "grad_norm": 1.7760926485061646, "learning_rate": 1.5369654205396115e-05, "loss": 0.5028, "step": 735000 }, { "epoch": 2.0792342311780536, "grad_norm": 2.144963264465332, "learning_rate": 1.534609614703244e-05, "loss": 0.5039, "step": 735500 }, { "epoch": 2.0806477146798743, "grad_norm": 2.0177903175354004, "learning_rate": 1.5322538088668763e-05, "loss": 0.5044, "step": 736000 }, { "epoch": 2.082061198181695, "grad_norm": 1.8800079822540283, "learning_rate": 1.5298980030305086e-05, "loss": 0.5012, "step": 736500 }, { "epoch": 2.0834746816835152, "grad_norm": 1.9640705585479736, "learning_rate": 1.5275421971941413e-05, "loss": 0.5018, "step": 737000 }, { "epoch": 2.084888165185336, "grad_norm": 2.000149965286255, "learning_rate": 1.5251863913577735e-05, "loss": 0.5071, "step": 737500 }, { "epoch": 2.0863016486871566, "grad_norm": 2.007354974746704, "learning_rate": 1.5228305855214059e-05, "loss": 0.5084, "step": 738000 }, { "epoch": 2.0877151321889773, "grad_norm": 1.9068665504455566, "learning_rate": 1.5204747796850383e-05, "loss": 0.5017, "step": 738500 }, { "epoch": 2.0891286156907976, "grad_norm": 2.0678930282592773, "learning_rate": 1.5181189738486706e-05, "loss": 0.5037, "step": 739000 }, { "epoch": 2.0905420991926182, "grad_norm": 1.8299510478973389, "learning_rate": 1.515763168012303e-05, "loss": 0.5038, "step": 739500 }, { "epoch": 2.091955582694439, "grad_norm": 1.8137887716293335, "learning_rate": 1.5134073621759354e-05, "loss": 0.5047, "step": 740000 }, { "epoch": 2.093369066196259, "grad_norm": 1.8272340297698975, "learning_rate": 1.5110515563395677e-05, "loss": 0.5035, "step": 740500 }, { "epoch": 2.09478254969808, "grad_norm": 1.8615729808807373, "learning_rate": 1.5086957505032001e-05, "loss": 0.5002, "step": 741000 }, { "epoch": 2.0961960331999006, "grad_norm": 1.9539703130722046, "learning_rate": 1.5063399446668325e-05, "loss": 0.5038, "step": 741500 }, { "epoch": 2.0976095167017212, "grad_norm": 1.7620818614959717, "learning_rate": 1.5039841388304648e-05, "loss": 0.5017, "step": 742000 }, { "epoch": 2.0990230002035415, "grad_norm": 1.6797075271606445, "learning_rate": 1.5016283329940975e-05, "loss": 0.5003, "step": 742500 }, { "epoch": 2.100436483705362, "grad_norm": 2.07619309425354, "learning_rate": 1.4992725271577299e-05, "loss": 0.5005, "step": 743000 }, { "epoch": 2.101849967207183, "grad_norm": 1.83912193775177, "learning_rate": 1.4969167213213623e-05, "loss": 0.5, "step": 743500 }, { "epoch": 2.103263450709003, "grad_norm": 1.9473897218704224, "learning_rate": 1.4945609154849946e-05, "loss": 0.5002, "step": 744000 }, { "epoch": 2.104676934210824, "grad_norm": 1.843422770500183, "learning_rate": 1.492205109648627e-05, "loss": 0.5061, "step": 744500 }, { "epoch": 2.1060904177126445, "grad_norm": 1.9645881652832031, "learning_rate": 1.4898493038122594e-05, "loss": 0.5051, "step": 745000 }, { "epoch": 2.107503901214465, "grad_norm": 1.9535120725631714, "learning_rate": 1.4874934979758917e-05, "loss": 0.505, "step": 745500 }, { "epoch": 2.1089173847162854, "grad_norm": 1.9835803508758545, "learning_rate": 1.4851376921395241e-05, "loss": 0.5021, "step": 746000 }, { "epoch": 2.110330868218106, "grad_norm": 1.7907167673110962, "learning_rate": 1.4827818863031565e-05, "loss": 0.5059, "step": 746500 }, { "epoch": 2.111744351719927, "grad_norm": 2.013474702835083, "learning_rate": 1.4804260804667888e-05, "loss": 0.4982, "step": 747000 }, { "epoch": 2.1131578352217475, "grad_norm": 1.9045283794403076, "learning_rate": 1.4780702746304212e-05, "loss": 0.5031, "step": 747500 }, { "epoch": 2.1145713187235677, "grad_norm": 2.1042094230651855, "learning_rate": 1.4757144687940536e-05, "loss": 0.5028, "step": 748000 }, { "epoch": 2.1159848022253884, "grad_norm": 1.781448245048523, "learning_rate": 1.473358662957686e-05, "loss": 0.5016, "step": 748500 }, { "epoch": 2.117398285727209, "grad_norm": 1.7052689790725708, "learning_rate": 1.4710028571213183e-05, "loss": 0.5036, "step": 749000 }, { "epoch": 2.1188117692290294, "grad_norm": 1.9337204694747925, "learning_rate": 1.4686470512849507e-05, "loss": 0.4989, "step": 749500 }, { "epoch": 2.12022525273085, "grad_norm": 1.8596725463867188, "learning_rate": 1.4662912454485834e-05, "loss": 0.4996, "step": 750000 }, { "epoch": 2.1216387362326707, "grad_norm": 1.8754959106445312, "learning_rate": 1.4639354396122158e-05, "loss": 0.4998, "step": 750500 }, { "epoch": 2.1230522197344914, "grad_norm": 1.9352794885635376, "learning_rate": 1.4615796337758481e-05, "loss": 0.5021, "step": 751000 }, { "epoch": 2.1244657032363117, "grad_norm": 1.9103277921676636, "learning_rate": 1.4592238279394805e-05, "loss": 0.5003, "step": 751500 }, { "epoch": 2.1258791867381324, "grad_norm": 1.9889451265335083, "learning_rate": 1.4568680221031129e-05, "loss": 0.5018, "step": 752000 }, { "epoch": 2.127292670239953, "grad_norm": 1.7815110683441162, "learning_rate": 1.4545122162667452e-05, "loss": 0.4997, "step": 752500 }, { "epoch": 2.1287061537417733, "grad_norm": 2.09352445602417, "learning_rate": 1.4521564104303776e-05, "loss": 0.5021, "step": 753000 }, { "epoch": 2.130119637243594, "grad_norm": 1.884161353111267, "learning_rate": 1.44980060459401e-05, "loss": 0.4986, "step": 753500 }, { "epoch": 2.1315331207454147, "grad_norm": 2.110827684402466, "learning_rate": 1.4474447987576423e-05, "loss": 0.5038, "step": 754000 }, { "epoch": 2.1329466042472354, "grad_norm": 1.9243890047073364, "learning_rate": 1.4450889929212747e-05, "loss": 0.5016, "step": 754500 }, { "epoch": 2.1343600877490556, "grad_norm": 1.9544833898544312, "learning_rate": 1.442733187084907e-05, "loss": 0.5016, "step": 755000 }, { "epoch": 2.1357735712508763, "grad_norm": 1.8541886806488037, "learning_rate": 1.4403773812485394e-05, "loss": 0.4985, "step": 755500 }, { "epoch": 2.137187054752697, "grad_norm": 1.8089663982391357, "learning_rate": 1.4380215754121718e-05, "loss": 0.5035, "step": 756000 }, { "epoch": 2.1386005382545177, "grad_norm": 1.933390736579895, "learning_rate": 1.4356657695758042e-05, "loss": 0.5029, "step": 756500 }, { "epoch": 2.140014021756338, "grad_norm": 1.7185612916946411, "learning_rate": 1.4333099637394367e-05, "loss": 0.5013, "step": 757000 }, { "epoch": 2.1414275052581586, "grad_norm": 1.8614178895950317, "learning_rate": 1.430954157903069e-05, "loss": 0.4998, "step": 757500 }, { "epoch": 2.1428409887599793, "grad_norm": 1.887507677078247, "learning_rate": 1.4285983520667014e-05, "loss": 0.5002, "step": 758000 }, { "epoch": 2.1442544722617995, "grad_norm": 2.0084047317504883, "learning_rate": 1.4262425462303338e-05, "loss": 0.5032, "step": 758500 }, { "epoch": 2.1456679557636202, "grad_norm": 1.9620707035064697, "learning_rate": 1.4238867403939662e-05, "loss": 0.5014, "step": 759000 }, { "epoch": 2.147081439265441, "grad_norm": 1.7401902675628662, "learning_rate": 1.4215309345575985e-05, "loss": 0.4962, "step": 759500 }, { "epoch": 2.1484949227672616, "grad_norm": 1.8588811159133911, "learning_rate": 1.4191751287212309e-05, "loss": 0.5055, "step": 760000 }, { "epoch": 2.149908406269082, "grad_norm": 1.7739200592041016, "learning_rate": 1.4168193228848633e-05, "loss": 0.497, "step": 760500 }, { "epoch": 2.1513218897709026, "grad_norm": 1.9261047840118408, "learning_rate": 1.4144635170484956e-05, "loss": 0.4974, "step": 761000 }, { "epoch": 2.1527353732727232, "grad_norm": 1.9217162132263184, "learning_rate": 1.4121077112121282e-05, "loss": 0.5003, "step": 761500 }, { "epoch": 2.154148856774544, "grad_norm": 1.8363077640533447, "learning_rate": 1.4097519053757605e-05, "loss": 0.5037, "step": 762000 }, { "epoch": 2.155562340276364, "grad_norm": 1.7070077657699585, "learning_rate": 1.4073960995393929e-05, "loss": 0.5003, "step": 762500 }, { "epoch": 2.156975823778185, "grad_norm": 1.9000979661941528, "learning_rate": 1.4050402937030253e-05, "loss": 0.4984, "step": 763000 }, { "epoch": 2.1583893072800056, "grad_norm": 1.9944567680358887, "learning_rate": 1.4026844878666576e-05, "loss": 0.4984, "step": 763500 }, { "epoch": 2.159802790781826, "grad_norm": 1.831699252128601, "learning_rate": 1.40032868203029e-05, "loss": 0.4989, "step": 764000 }, { "epoch": 2.1612162742836465, "grad_norm": 1.812870740890503, "learning_rate": 1.3979728761939225e-05, "loss": 0.5033, "step": 764500 }, { "epoch": 2.162629757785467, "grad_norm": 1.9506762027740479, "learning_rate": 1.395617070357555e-05, "loss": 0.499, "step": 765000 }, { "epoch": 2.164043241287288, "grad_norm": 2.340280055999756, "learning_rate": 1.3932612645211873e-05, "loss": 0.5051, "step": 765500 }, { "epoch": 2.165456724789108, "grad_norm": 1.8961091041564941, "learning_rate": 1.3909054586848197e-05, "loss": 0.4967, "step": 766000 }, { "epoch": 2.166870208290929, "grad_norm": 1.875096082687378, "learning_rate": 1.388549652848452e-05, "loss": 0.4982, "step": 766500 }, { "epoch": 2.1682836917927495, "grad_norm": 1.928933024406433, "learning_rate": 1.3861938470120844e-05, "loss": 0.4988, "step": 767000 }, { "epoch": 2.16969717529457, "grad_norm": 1.7868136167526245, "learning_rate": 1.3838380411757168e-05, "loss": 0.4989, "step": 767500 }, { "epoch": 2.1711106587963904, "grad_norm": 1.836256504058838, "learning_rate": 1.3814822353393491e-05, "loss": 0.5025, "step": 768000 }, { "epoch": 2.172524142298211, "grad_norm": 1.862635612487793, "learning_rate": 1.3791264295029815e-05, "loss": 0.5025, "step": 768500 }, { "epoch": 2.173937625800032, "grad_norm": 1.8089096546173096, "learning_rate": 1.3767706236666139e-05, "loss": 0.5003, "step": 769000 }, { "epoch": 2.175351109301852, "grad_norm": 1.8310176134109497, "learning_rate": 1.3744148178302462e-05, "loss": 0.4974, "step": 769500 }, { "epoch": 2.1767645928036727, "grad_norm": 2.256953716278076, "learning_rate": 1.3720590119938786e-05, "loss": 0.5008, "step": 770000 }, { "epoch": 2.1781780763054934, "grad_norm": 1.7136662006378174, "learning_rate": 1.369703206157511e-05, "loss": 0.4992, "step": 770500 }, { "epoch": 2.179591559807314, "grad_norm": 1.873728632926941, "learning_rate": 1.3673474003211433e-05, "loss": 0.5002, "step": 771000 }, { "epoch": 2.1810050433091344, "grad_norm": 2.0753188133239746, "learning_rate": 1.364991594484776e-05, "loss": 0.4957, "step": 771500 }, { "epoch": 2.182418526810955, "grad_norm": 1.7614505290985107, "learning_rate": 1.3626357886484084e-05, "loss": 0.5033, "step": 772000 }, { "epoch": 2.1838320103127757, "grad_norm": 1.9992095232009888, "learning_rate": 1.3602799828120408e-05, "loss": 0.503, "step": 772500 }, { "epoch": 2.1852454938145964, "grad_norm": 1.7822327613830566, "learning_rate": 1.3579241769756731e-05, "loss": 0.4972, "step": 773000 }, { "epoch": 2.1866589773164167, "grad_norm": 1.923516869544983, "learning_rate": 1.3555683711393055e-05, "loss": 0.4971, "step": 773500 }, { "epoch": 2.1880724608182374, "grad_norm": 2.084929943084717, "learning_rate": 1.3532125653029379e-05, "loss": 0.4958, "step": 774000 }, { "epoch": 2.189485944320058, "grad_norm": 1.863441824913025, "learning_rate": 1.3508567594665702e-05, "loss": 0.4964, "step": 774500 }, { "epoch": 2.1908994278218783, "grad_norm": 2.0089516639709473, "learning_rate": 1.3485009536302026e-05, "loss": 0.4996, "step": 775000 }, { "epoch": 2.192312911323699, "grad_norm": 1.8849588632583618, "learning_rate": 1.346145147793835e-05, "loss": 0.5007, "step": 775500 }, { "epoch": 2.1937263948255197, "grad_norm": 1.8912904262542725, "learning_rate": 1.3437893419574673e-05, "loss": 0.5006, "step": 776000 }, { "epoch": 2.1951398783273404, "grad_norm": 1.8183059692382812, "learning_rate": 1.3414335361210997e-05, "loss": 0.5014, "step": 776500 }, { "epoch": 2.1965533618291606, "grad_norm": 1.8556183576583862, "learning_rate": 1.339077730284732e-05, "loss": 0.5015, "step": 777000 }, { "epoch": 2.1979668453309813, "grad_norm": 2.0111796855926514, "learning_rate": 1.3367219244483644e-05, "loss": 0.4953, "step": 777500 }, { "epoch": 2.199380328832802, "grad_norm": 1.8386739492416382, "learning_rate": 1.3343661186119968e-05, "loss": 0.4978, "step": 778000 }, { "epoch": 2.2007938123346222, "grad_norm": 1.9594324827194214, "learning_rate": 1.3320103127756292e-05, "loss": 0.4976, "step": 778500 }, { "epoch": 2.202207295836443, "grad_norm": 1.8209351301193237, "learning_rate": 1.3296545069392619e-05, "loss": 0.4999, "step": 779000 }, { "epoch": 2.2036207793382636, "grad_norm": 2.041916847229004, "learning_rate": 1.3272987011028942e-05, "loss": 0.5002, "step": 779500 }, { "epoch": 2.2050342628400843, "grad_norm": 1.8536685705184937, "learning_rate": 1.3249428952665266e-05, "loss": 0.5009, "step": 780000 }, { "epoch": 2.2064477463419045, "grad_norm": 1.8728760480880737, "learning_rate": 1.322587089430159e-05, "loss": 0.4978, "step": 780500 }, { "epoch": 2.2078612298437252, "grad_norm": 1.965193271636963, "learning_rate": 1.3202312835937914e-05, "loss": 0.4958, "step": 781000 }, { "epoch": 2.209274713345546, "grad_norm": 1.8471477031707764, "learning_rate": 1.3178754777574237e-05, "loss": 0.4991, "step": 781500 }, { "epoch": 2.210688196847366, "grad_norm": 2.0527541637420654, "learning_rate": 1.3155196719210561e-05, "loss": 0.4973, "step": 782000 }, { "epoch": 2.212101680349187, "grad_norm": 2.024754524230957, "learning_rate": 1.3131638660846885e-05, "loss": 0.4974, "step": 782500 }, { "epoch": 2.2135151638510076, "grad_norm": 1.8139668703079224, "learning_rate": 1.3108080602483208e-05, "loss": 0.4955, "step": 783000 }, { "epoch": 2.2149286473528282, "grad_norm": 1.8894438743591309, "learning_rate": 1.3084522544119532e-05, "loss": 0.4983, "step": 783500 }, { "epoch": 2.2163421308546485, "grad_norm": 2.0087132453918457, "learning_rate": 1.3060964485755856e-05, "loss": 0.5004, "step": 784000 }, { "epoch": 2.217755614356469, "grad_norm": 1.9600775241851807, "learning_rate": 1.303740642739218e-05, "loss": 0.5003, "step": 784500 }, { "epoch": 2.21916909785829, "grad_norm": 1.8303312063217163, "learning_rate": 1.3013848369028503e-05, "loss": 0.4976, "step": 785000 }, { "epoch": 2.2205825813601106, "grad_norm": 2.0498669147491455, "learning_rate": 1.2990290310664827e-05, "loss": 0.4968, "step": 785500 }, { "epoch": 2.221996064861931, "grad_norm": 2.2000315189361572, "learning_rate": 1.2966732252301154e-05, "loss": 0.5003, "step": 786000 }, { "epoch": 2.2234095483637515, "grad_norm": 1.6993693113327026, "learning_rate": 1.2943174193937477e-05, "loss": 0.4951, "step": 786500 }, { "epoch": 2.224823031865572, "grad_norm": 1.7949968576431274, "learning_rate": 1.2919616135573801e-05, "loss": 0.4978, "step": 787000 }, { "epoch": 2.2262365153673924, "grad_norm": 1.933198094367981, "learning_rate": 1.2896058077210125e-05, "loss": 0.4995, "step": 787500 }, { "epoch": 2.227649998869213, "grad_norm": 1.9566715955734253, "learning_rate": 1.2872500018846448e-05, "loss": 0.4957, "step": 788000 }, { "epoch": 2.229063482371034, "grad_norm": 1.9049962759017944, "learning_rate": 1.2848941960482772e-05, "loss": 0.4999, "step": 788500 }, { "epoch": 2.2304769658728545, "grad_norm": 1.758914828300476, "learning_rate": 1.2825383902119096e-05, "loss": 0.4992, "step": 789000 }, { "epoch": 2.2318904493746747, "grad_norm": 2.0515353679656982, "learning_rate": 1.280182584375542e-05, "loss": 0.494, "step": 789500 }, { "epoch": 2.2333039328764954, "grad_norm": 1.845266580581665, "learning_rate": 1.2778267785391743e-05, "loss": 0.4954, "step": 790000 }, { "epoch": 2.234717416378316, "grad_norm": 1.7661292552947998, "learning_rate": 1.2754709727028067e-05, "loss": 0.4915, "step": 790500 }, { "epoch": 2.236130899880137, "grad_norm": 1.9317253828048706, "learning_rate": 1.273115166866439e-05, "loss": 0.4953, "step": 791000 }, { "epoch": 2.237544383381957, "grad_norm": 1.846547245979309, "learning_rate": 1.2707593610300714e-05, "loss": 0.495, "step": 791500 }, { "epoch": 2.2389578668837777, "grad_norm": 1.8419394493103027, "learning_rate": 1.2684035551937038e-05, "loss": 0.4948, "step": 792000 }, { "epoch": 2.2403713503855984, "grad_norm": 1.7515804767608643, "learning_rate": 1.2660477493573361e-05, "loss": 0.5004, "step": 792500 }, { "epoch": 2.2417848338874187, "grad_norm": 1.982062578201294, "learning_rate": 1.2636919435209685e-05, "loss": 0.4923, "step": 793000 }, { "epoch": 2.2431983173892394, "grad_norm": 1.9448508024215698, "learning_rate": 1.261336137684601e-05, "loss": 0.4937, "step": 793500 }, { "epoch": 2.24461180089106, "grad_norm": 1.9259520769119263, "learning_rate": 1.2589803318482334e-05, "loss": 0.4974, "step": 794000 }, { "epoch": 2.2460252843928807, "grad_norm": 1.825293779373169, "learning_rate": 1.256624526011866e-05, "loss": 0.4958, "step": 794500 }, { "epoch": 2.247438767894701, "grad_norm": 2.093219757080078, "learning_rate": 1.2542687201754983e-05, "loss": 0.4955, "step": 795000 }, { "epoch": 2.2488522513965217, "grad_norm": 1.8751220703125, "learning_rate": 1.2519129143391307e-05, "loss": 0.4956, "step": 795500 }, { "epoch": 2.2502657348983424, "grad_norm": 1.8991122245788574, "learning_rate": 1.249557108502763e-05, "loss": 0.4965, "step": 796000 }, { "epoch": 2.251679218400163, "grad_norm": 1.8788878917694092, "learning_rate": 1.2472013026663954e-05, "loss": 0.4977, "step": 796500 }, { "epoch": 2.2530927019019833, "grad_norm": 1.921410322189331, "learning_rate": 1.2448454968300278e-05, "loss": 0.4959, "step": 797000 }, { "epoch": 2.254506185403804, "grad_norm": 1.6473151445388794, "learning_rate": 1.2424896909936602e-05, "loss": 0.4979, "step": 797500 }, { "epoch": 2.2559196689056247, "grad_norm": 1.9779410362243652, "learning_rate": 1.2401338851572925e-05, "loss": 0.4975, "step": 798000 }, { "epoch": 2.257333152407445, "grad_norm": 1.5984779596328735, "learning_rate": 1.2377780793209249e-05, "loss": 0.4927, "step": 798500 }, { "epoch": 2.2587466359092656, "grad_norm": 1.9285475015640259, "learning_rate": 1.2354222734845573e-05, "loss": 0.492, "step": 799000 }, { "epoch": 2.2601601194110863, "grad_norm": 1.892082929611206, "learning_rate": 1.2330664676481896e-05, "loss": 0.4936, "step": 799500 }, { "epoch": 2.261573602912907, "grad_norm": 1.7956061363220215, "learning_rate": 1.230710661811822e-05, "loss": 0.4927, "step": 800000 }, { "epoch": 2.2629870864147272, "grad_norm": 2.1109731197357178, "learning_rate": 1.2283548559754544e-05, "loss": 0.4972, "step": 800500 }, { "epoch": 2.264400569916548, "grad_norm": 1.9648841619491577, "learning_rate": 1.2259990501390867e-05, "loss": 0.4961, "step": 801000 }, { "epoch": 2.2658140534183686, "grad_norm": 1.8685731887817383, "learning_rate": 1.2236432443027191e-05, "loss": 0.4985, "step": 801500 }, { "epoch": 2.2672275369201893, "grad_norm": 2.024876356124878, "learning_rate": 1.2212874384663516e-05, "loss": 0.4939, "step": 802000 }, { "epoch": 2.2686410204220095, "grad_norm": 1.8625428676605225, "learning_rate": 1.218931632629984e-05, "loss": 0.493, "step": 802500 }, { "epoch": 2.2700545039238302, "grad_norm": 1.908742904663086, "learning_rate": 1.2165758267936164e-05, "loss": 0.4933, "step": 803000 }, { "epoch": 2.271467987425651, "grad_norm": 1.8099857568740845, "learning_rate": 1.2142200209572487e-05, "loss": 0.4952, "step": 803500 }, { "epoch": 2.272881470927471, "grad_norm": 1.870478868484497, "learning_rate": 1.2118642151208811e-05, "loss": 0.4931, "step": 804000 }, { "epoch": 2.274294954429292, "grad_norm": 1.8700393438339233, "learning_rate": 1.2095084092845135e-05, "loss": 0.4946, "step": 804500 }, { "epoch": 2.2757084379311125, "grad_norm": 1.8643546104431152, "learning_rate": 1.2071526034481458e-05, "loss": 0.4953, "step": 805000 }, { "epoch": 2.277121921432933, "grad_norm": 1.8416587114334106, "learning_rate": 1.2047967976117782e-05, "loss": 0.4953, "step": 805500 }, { "epoch": 2.2785354049347535, "grad_norm": 1.918830394744873, "learning_rate": 1.2024409917754107e-05, "loss": 0.4983, "step": 806000 }, { "epoch": 2.279948888436574, "grad_norm": 1.7947114706039429, "learning_rate": 1.2000851859390431e-05, "loss": 0.4946, "step": 806500 }, { "epoch": 2.281362371938395, "grad_norm": 1.6178207397460938, "learning_rate": 1.1977293801026755e-05, "loss": 0.4905, "step": 807000 }, { "epoch": 2.2827758554402156, "grad_norm": 1.9062438011169434, "learning_rate": 1.1953735742663078e-05, "loss": 0.4925, "step": 807500 }, { "epoch": 2.284189338942036, "grad_norm": 1.86580228805542, "learning_rate": 1.1930177684299402e-05, "loss": 0.4953, "step": 808000 }, { "epoch": 2.2856028224438565, "grad_norm": 1.8826972246170044, "learning_rate": 1.1906619625935726e-05, "loss": 0.4987, "step": 808500 }, { "epoch": 2.287016305945677, "grad_norm": 1.9593868255615234, "learning_rate": 1.188306156757205e-05, "loss": 0.4928, "step": 809000 }, { "epoch": 2.2884297894474974, "grad_norm": 1.9011677503585815, "learning_rate": 1.1859503509208375e-05, "loss": 0.4952, "step": 809500 }, { "epoch": 2.289843272949318, "grad_norm": 1.8397998809814453, "learning_rate": 1.1835945450844698e-05, "loss": 0.4959, "step": 810000 }, { "epoch": 2.291256756451139, "grad_norm": 1.7815238237380981, "learning_rate": 1.1812387392481022e-05, "loss": 0.4934, "step": 810500 }, { "epoch": 2.292670239952959, "grad_norm": 1.76918363571167, "learning_rate": 1.1788829334117346e-05, "loss": 0.4921, "step": 811000 }, { "epoch": 2.2940837234547797, "grad_norm": 1.8652615547180176, "learning_rate": 1.176527127575367e-05, "loss": 0.4932, "step": 811500 }, { "epoch": 2.2954972069566004, "grad_norm": 1.9286701679229736, "learning_rate": 1.1741713217389993e-05, "loss": 0.4932, "step": 812000 }, { "epoch": 2.296910690458421, "grad_norm": 1.9100277423858643, "learning_rate": 1.1718155159026317e-05, "loss": 0.4917, "step": 812500 }, { "epoch": 2.2983241739602414, "grad_norm": 2.010582208633423, "learning_rate": 1.1694597100662642e-05, "loss": 0.4945, "step": 813000 }, { "epoch": 2.299737657462062, "grad_norm": 1.9186463356018066, "learning_rate": 1.1671039042298966e-05, "loss": 0.4937, "step": 813500 }, { "epoch": 2.3011511409638827, "grad_norm": 1.8581461906433105, "learning_rate": 1.164748098393529e-05, "loss": 0.4935, "step": 814000 }, { "epoch": 2.3025646244657034, "grad_norm": 2.07281231880188, "learning_rate": 1.1623922925571613e-05, "loss": 0.4897, "step": 814500 }, { "epoch": 2.3039781079675237, "grad_norm": 1.8915587663650513, "learning_rate": 1.1600364867207937e-05, "loss": 0.4893, "step": 815000 }, { "epoch": 2.3053915914693444, "grad_norm": 2.036345958709717, "learning_rate": 1.157680680884426e-05, "loss": 0.4931, "step": 815500 }, { "epoch": 2.306805074971165, "grad_norm": 1.8789151906967163, "learning_rate": 1.1553248750480584e-05, "loss": 0.4933, "step": 816000 }, { "epoch": 2.3082185584729853, "grad_norm": 1.7548370361328125, "learning_rate": 1.1529690692116908e-05, "loss": 0.4969, "step": 816500 }, { "epoch": 2.309632041974806, "grad_norm": 2.00046968460083, "learning_rate": 1.1506132633753233e-05, "loss": 0.4936, "step": 817000 }, { "epoch": 2.3110455254766267, "grad_norm": 1.939008355140686, "learning_rate": 1.1482574575389557e-05, "loss": 0.4874, "step": 817500 }, { "epoch": 2.3124590089784474, "grad_norm": 2.0367040634155273, "learning_rate": 1.145901651702588e-05, "loss": 0.4936, "step": 818000 }, { "epoch": 2.3138724924802676, "grad_norm": 1.9299179315567017, "learning_rate": 1.1435458458662204e-05, "loss": 0.4958, "step": 818500 }, { "epoch": 2.3152859759820883, "grad_norm": 2.0582427978515625, "learning_rate": 1.1411900400298528e-05, "loss": 0.4898, "step": 819000 }, { "epoch": 2.316699459483909, "grad_norm": 1.9195109605789185, "learning_rate": 1.1388342341934852e-05, "loss": 0.4919, "step": 819500 }, { "epoch": 2.3181129429857297, "grad_norm": 1.9123340845108032, "learning_rate": 1.1364784283571175e-05, "loss": 0.4938, "step": 820000 }, { "epoch": 2.31952642648755, "grad_norm": 1.7881367206573486, "learning_rate": 1.13412262252075e-05, "loss": 0.4892, "step": 820500 }, { "epoch": 2.3209399099893706, "grad_norm": 1.961742877960205, "learning_rate": 1.1317668166843824e-05, "loss": 0.4959, "step": 821000 }, { "epoch": 2.3223533934911913, "grad_norm": 2.0317580699920654, "learning_rate": 1.1294110108480148e-05, "loss": 0.4916, "step": 821500 }, { "epoch": 2.3237668769930115, "grad_norm": 1.8701131343841553, "learning_rate": 1.1270552050116472e-05, "loss": 0.4895, "step": 822000 }, { "epoch": 2.3251803604948322, "grad_norm": 1.8490756750106812, "learning_rate": 1.1246993991752795e-05, "loss": 0.4891, "step": 822500 }, { "epoch": 2.326593843996653, "grad_norm": 1.8682472705841064, "learning_rate": 1.1223435933389119e-05, "loss": 0.4932, "step": 823000 }, { "epoch": 2.3280073274984736, "grad_norm": 1.8672633171081543, "learning_rate": 1.1199877875025443e-05, "loss": 0.4929, "step": 823500 }, { "epoch": 2.329420811000294, "grad_norm": 1.930944561958313, "learning_rate": 1.1176319816661768e-05, "loss": 0.4926, "step": 824000 }, { "epoch": 2.3308342945021145, "grad_norm": 2.039769411087036, "learning_rate": 1.1152761758298092e-05, "loss": 0.4918, "step": 824500 }, { "epoch": 2.3322477780039352, "grad_norm": 2.104445457458496, "learning_rate": 1.1129203699934415e-05, "loss": 0.4934, "step": 825000 }, { "epoch": 2.333661261505756, "grad_norm": 1.8117077350616455, "learning_rate": 1.110564564157074e-05, "loss": 0.491, "step": 825500 }, { "epoch": 2.335074745007576, "grad_norm": 1.924404263496399, "learning_rate": 1.1082087583207063e-05, "loss": 0.4926, "step": 826000 }, { "epoch": 2.336488228509397, "grad_norm": 1.9602303504943848, "learning_rate": 1.1058529524843387e-05, "loss": 0.4917, "step": 826500 }, { "epoch": 2.3379017120112175, "grad_norm": 1.7295727729797363, "learning_rate": 1.103497146647971e-05, "loss": 0.4913, "step": 827000 }, { "epoch": 2.339315195513038, "grad_norm": 2.1023011207580566, "learning_rate": 1.1011413408116036e-05, "loss": 0.4914, "step": 827500 }, { "epoch": 2.3407286790148585, "grad_norm": 1.721153974533081, "learning_rate": 1.098785534975236e-05, "loss": 0.4926, "step": 828000 }, { "epoch": 2.342142162516679, "grad_norm": 1.9268840551376343, "learning_rate": 1.0964297291388683e-05, "loss": 0.4943, "step": 828500 }, { "epoch": 2.3435556460185, "grad_norm": 2.0114941596984863, "learning_rate": 1.0940739233025007e-05, "loss": 0.4906, "step": 829000 }, { "epoch": 2.34496912952032, "grad_norm": 1.8542444705963135, "learning_rate": 1.091718117466133e-05, "loss": 0.4915, "step": 829500 }, { "epoch": 2.346382613022141, "grad_norm": 1.9367399215698242, "learning_rate": 1.0893623116297654e-05, "loss": 0.4927, "step": 830000 }, { "epoch": 2.3477960965239615, "grad_norm": 1.7886523008346558, "learning_rate": 1.0870065057933978e-05, "loss": 0.4915, "step": 830500 }, { "epoch": 2.349209580025782, "grad_norm": 2.030327320098877, "learning_rate": 1.0846506999570301e-05, "loss": 0.4908, "step": 831000 }, { "epoch": 2.3506230635276024, "grad_norm": 1.7950180768966675, "learning_rate": 1.0822948941206627e-05, "loss": 0.4885, "step": 831500 }, { "epoch": 2.352036547029423, "grad_norm": 1.9162943363189697, "learning_rate": 1.079939088284295e-05, "loss": 0.4906, "step": 832000 }, { "epoch": 2.353450030531244, "grad_norm": 1.7870396375656128, "learning_rate": 1.0775832824479274e-05, "loss": 0.4927, "step": 832500 }, { "epoch": 2.354863514033064, "grad_norm": 1.82160484790802, "learning_rate": 1.0752274766115598e-05, "loss": 0.491, "step": 833000 }, { "epoch": 2.3562769975348847, "grad_norm": 1.7914776802062988, "learning_rate": 1.0728716707751921e-05, "loss": 0.4896, "step": 833500 }, { "epoch": 2.3576904810367054, "grad_norm": 1.8668171167373657, "learning_rate": 1.0705158649388245e-05, "loss": 0.4942, "step": 834000 }, { "epoch": 2.3591039645385257, "grad_norm": 2.1333682537078857, "learning_rate": 1.0681600591024569e-05, "loss": 0.4909, "step": 834500 }, { "epoch": 2.3605174480403464, "grad_norm": 1.857965111732483, "learning_rate": 1.0658042532660892e-05, "loss": 0.4906, "step": 835000 }, { "epoch": 2.361930931542167, "grad_norm": 1.968218445777893, "learning_rate": 1.0634484474297218e-05, "loss": 0.4882, "step": 835500 }, { "epoch": 2.3633444150439877, "grad_norm": 2.047802686691284, "learning_rate": 1.0610926415933541e-05, "loss": 0.4896, "step": 836000 }, { "epoch": 2.364757898545808, "grad_norm": 1.8960331678390503, "learning_rate": 1.0587368357569865e-05, "loss": 0.4954, "step": 836500 }, { "epoch": 2.3661713820476287, "grad_norm": 1.7266817092895508, "learning_rate": 1.0563810299206189e-05, "loss": 0.4902, "step": 837000 }, { "epoch": 2.3675848655494494, "grad_norm": 1.997955560684204, "learning_rate": 1.0540252240842512e-05, "loss": 0.4903, "step": 837500 }, { "epoch": 2.36899834905127, "grad_norm": 1.7720695734024048, "learning_rate": 1.0516694182478836e-05, "loss": 0.4878, "step": 838000 }, { "epoch": 2.3704118325530903, "grad_norm": 1.9745116233825684, "learning_rate": 1.049313612411516e-05, "loss": 0.488, "step": 838500 }, { "epoch": 2.371825316054911, "grad_norm": 2.0489704608917236, "learning_rate": 1.0469578065751483e-05, "loss": 0.4921, "step": 839000 }, { "epoch": 2.3732387995567317, "grad_norm": 1.7937244176864624, "learning_rate": 1.0446020007387807e-05, "loss": 0.4886, "step": 839500 }, { "epoch": 2.374652283058552, "grad_norm": 1.8301666975021362, "learning_rate": 1.042246194902413e-05, "loss": 0.4898, "step": 840000 }, { "epoch": 2.3760657665603726, "grad_norm": 1.9129910469055176, "learning_rate": 1.0398903890660454e-05, "loss": 0.4906, "step": 840500 }, { "epoch": 2.3774792500621933, "grad_norm": 1.884525179862976, "learning_rate": 1.0375345832296778e-05, "loss": 0.4904, "step": 841000 }, { "epoch": 2.378892733564014, "grad_norm": 1.7935594320297241, "learning_rate": 1.0351787773933102e-05, "loss": 0.4891, "step": 841500 }, { "epoch": 2.3803062170658342, "grad_norm": 1.7183648347854614, "learning_rate": 1.0328229715569427e-05, "loss": 0.4884, "step": 842000 }, { "epoch": 2.381719700567655, "grad_norm": 2.0637903213500977, "learning_rate": 1.0304671657205751e-05, "loss": 0.4906, "step": 842500 }, { "epoch": 2.3831331840694756, "grad_norm": 1.9679921865463257, "learning_rate": 1.0281113598842075e-05, "loss": 0.4927, "step": 843000 }, { "epoch": 2.3845466675712963, "grad_norm": 1.575168251991272, "learning_rate": 1.0257555540478398e-05, "loss": 0.4884, "step": 843500 }, { "epoch": 2.3859601510731165, "grad_norm": 1.9503480195999146, "learning_rate": 1.0233997482114722e-05, "loss": 0.4891, "step": 844000 }, { "epoch": 2.3873736345749372, "grad_norm": 1.8788962364196777, "learning_rate": 1.0210439423751046e-05, "loss": 0.4911, "step": 844500 }, { "epoch": 2.388787118076758, "grad_norm": 2.042238712310791, "learning_rate": 1.018688136538737e-05, "loss": 0.4883, "step": 845000 }, { "epoch": 2.390200601578578, "grad_norm": 1.9492192268371582, "learning_rate": 1.0163323307023693e-05, "loss": 0.4884, "step": 845500 }, { "epoch": 2.391614085080399, "grad_norm": 1.894923448562622, "learning_rate": 1.0139765248660018e-05, "loss": 0.4882, "step": 846000 }, { "epoch": 2.3930275685822195, "grad_norm": 1.9473282098770142, "learning_rate": 1.0116207190296342e-05, "loss": 0.4883, "step": 846500 }, { "epoch": 2.3944410520840402, "grad_norm": 1.9197335243225098, "learning_rate": 1.0092649131932666e-05, "loss": 0.4872, "step": 847000 }, { "epoch": 2.3958545355858605, "grad_norm": 1.963704228401184, "learning_rate": 1.006909107356899e-05, "loss": 0.4898, "step": 847500 }, { "epoch": 2.397268019087681, "grad_norm": 1.745520830154419, "learning_rate": 1.0045533015205313e-05, "loss": 0.4906, "step": 848000 }, { "epoch": 2.398681502589502, "grad_norm": 1.8317735195159912, "learning_rate": 1.0021974956841637e-05, "loss": 0.4896, "step": 848500 }, { "epoch": 2.4000949860913225, "grad_norm": 2.0662906169891357, "learning_rate": 9.99841689847796e-06, "loss": 0.4904, "step": 849000 }, { "epoch": 2.401508469593143, "grad_norm": 1.971664547920227, "learning_rate": 9.974858840114286e-06, "loss": 0.4915, "step": 849500 }, { "epoch": 2.4029219530949635, "grad_norm": 1.8927421569824219, "learning_rate": 9.95130078175061e-06, "loss": 0.4888, "step": 850000 }, { "epoch": 2.404335436596784, "grad_norm": 1.8458671569824219, "learning_rate": 9.927742723386933e-06, "loss": 0.4867, "step": 850500 }, { "epoch": 2.4057489200986044, "grad_norm": 1.9615566730499268, "learning_rate": 9.904184665023257e-06, "loss": 0.4893, "step": 851000 }, { "epoch": 2.407162403600425, "grad_norm": 1.9216076135635376, "learning_rate": 9.88062660665958e-06, "loss": 0.4858, "step": 851500 }, { "epoch": 2.408575887102246, "grad_norm": 1.9650517702102661, "learning_rate": 9.857068548295904e-06, "loss": 0.492, "step": 852000 }, { "epoch": 2.4099893706040665, "grad_norm": 1.8412058353424072, "learning_rate": 9.833510489932228e-06, "loss": 0.4896, "step": 852500 }, { "epoch": 2.4114028541058867, "grad_norm": 1.7156356573104858, "learning_rate": 9.809952431568553e-06, "loss": 0.4892, "step": 853000 }, { "epoch": 2.4128163376077074, "grad_norm": 1.9220515489578247, "learning_rate": 9.786394373204877e-06, "loss": 0.4876, "step": 853500 }, { "epoch": 2.414229821109528, "grad_norm": 1.9614249467849731, "learning_rate": 9.7628363148412e-06, "loss": 0.4913, "step": 854000 }, { "epoch": 2.415643304611349, "grad_norm": 1.93386709690094, "learning_rate": 9.739278256477524e-06, "loss": 0.4856, "step": 854500 }, { "epoch": 2.417056788113169, "grad_norm": 1.9894529581069946, "learning_rate": 9.715720198113848e-06, "loss": 0.4896, "step": 855000 }, { "epoch": 2.4184702716149897, "grad_norm": 1.8312712907791138, "learning_rate": 9.692162139750171e-06, "loss": 0.4904, "step": 855500 }, { "epoch": 2.4198837551168104, "grad_norm": 1.9193555116653442, "learning_rate": 9.668604081386495e-06, "loss": 0.4911, "step": 856000 }, { "epoch": 2.4212972386186307, "grad_norm": 1.7844483852386475, "learning_rate": 9.64504602302282e-06, "loss": 0.4862, "step": 856500 }, { "epoch": 2.4227107221204514, "grad_norm": 1.8880608081817627, "learning_rate": 9.621487964659144e-06, "loss": 0.4861, "step": 857000 }, { "epoch": 2.424124205622272, "grad_norm": 1.9176058769226074, "learning_rate": 9.597929906295468e-06, "loss": 0.4904, "step": 857500 }, { "epoch": 2.4255376891240927, "grad_norm": 1.9004697799682617, "learning_rate": 9.574371847931792e-06, "loss": 0.4895, "step": 858000 }, { "epoch": 2.426951172625913, "grad_norm": 1.9729632139205933, "learning_rate": 9.550813789568115e-06, "loss": 0.4851, "step": 858500 }, { "epoch": 2.4283646561277337, "grad_norm": 1.6874969005584717, "learning_rate": 9.527255731204439e-06, "loss": 0.4895, "step": 859000 }, { "epoch": 2.4297781396295544, "grad_norm": 2.001664161682129, "learning_rate": 9.503697672840763e-06, "loss": 0.4859, "step": 859500 }, { "epoch": 2.431191623131375, "grad_norm": 2.103142023086548, "learning_rate": 9.480139614477086e-06, "loss": 0.489, "step": 860000 }, { "epoch": 2.4326051066331953, "grad_norm": 1.9392062425613403, "learning_rate": 9.456581556113412e-06, "loss": 0.4904, "step": 860500 }, { "epoch": 2.434018590135016, "grad_norm": 2.053927421569824, "learning_rate": 9.433023497749735e-06, "loss": 0.4875, "step": 861000 }, { "epoch": 2.4354320736368367, "grad_norm": 2.061859130859375, "learning_rate": 9.409465439386059e-06, "loss": 0.4832, "step": 861500 }, { "epoch": 2.436845557138657, "grad_norm": 1.9548988342285156, "learning_rate": 9.385907381022383e-06, "loss": 0.4872, "step": 862000 }, { "epoch": 2.4382590406404776, "grad_norm": 1.8615442514419556, "learning_rate": 9.362349322658706e-06, "loss": 0.4895, "step": 862500 }, { "epoch": 2.4396725241422983, "grad_norm": 2.0103771686553955, "learning_rate": 9.33879126429503e-06, "loss": 0.4921, "step": 863000 }, { "epoch": 2.4410860076441185, "grad_norm": 1.8544646501541138, "learning_rate": 9.315233205931354e-06, "loss": 0.4873, "step": 863500 }, { "epoch": 2.4424994911459392, "grad_norm": 1.8092107772827148, "learning_rate": 9.291675147567679e-06, "loss": 0.483, "step": 864000 }, { "epoch": 2.44391297464776, "grad_norm": 1.981996774673462, "learning_rate": 9.268117089204003e-06, "loss": 0.4847, "step": 864500 }, { "epoch": 2.4453264581495806, "grad_norm": 1.7634198665618896, "learning_rate": 9.244559030840326e-06, "loss": 0.4886, "step": 865000 }, { "epoch": 2.446739941651401, "grad_norm": 1.7768751382827759, "learning_rate": 9.22100097247665e-06, "loss": 0.4864, "step": 865500 }, { "epoch": 2.4481534251532215, "grad_norm": 1.733256459236145, "learning_rate": 9.197442914112974e-06, "loss": 0.4852, "step": 866000 }, { "epoch": 2.4495669086550422, "grad_norm": 2.041760206222534, "learning_rate": 9.173884855749297e-06, "loss": 0.4887, "step": 866500 }, { "epoch": 2.450980392156863, "grad_norm": 1.9992377758026123, "learning_rate": 9.150326797385621e-06, "loss": 0.4858, "step": 867000 }, { "epoch": 2.452393875658683, "grad_norm": 1.9723509550094604, "learning_rate": 9.126768739021946e-06, "loss": 0.4862, "step": 867500 }, { "epoch": 2.453807359160504, "grad_norm": 1.95931875705719, "learning_rate": 9.10321068065827e-06, "loss": 0.4862, "step": 868000 }, { "epoch": 2.4552208426623245, "grad_norm": 1.858932614326477, "learning_rate": 9.079652622294594e-06, "loss": 0.4881, "step": 868500 }, { "epoch": 2.456634326164145, "grad_norm": 2.139177083969116, "learning_rate": 9.056094563930917e-06, "loss": 0.4879, "step": 869000 }, { "epoch": 2.4580478096659655, "grad_norm": 1.8022249937057495, "learning_rate": 9.032536505567241e-06, "loss": 0.489, "step": 869500 }, { "epoch": 2.459461293167786, "grad_norm": 1.8559181690216064, "learning_rate": 9.008978447203565e-06, "loss": 0.4901, "step": 870000 }, { "epoch": 2.460874776669607, "grad_norm": 1.9003877639770508, "learning_rate": 8.985420388839888e-06, "loss": 0.4842, "step": 870500 }, { "epoch": 2.462288260171427, "grad_norm": 1.825510859489441, "learning_rate": 8.961862330476214e-06, "loss": 0.488, "step": 871000 }, { "epoch": 2.463701743673248, "grad_norm": 1.84860098361969, "learning_rate": 8.938304272112538e-06, "loss": 0.4869, "step": 871500 }, { "epoch": 2.4651152271750685, "grad_norm": 2.0540010929107666, "learning_rate": 8.914746213748861e-06, "loss": 0.4892, "step": 872000 }, { "epoch": 2.466528710676889, "grad_norm": 1.897650957107544, "learning_rate": 8.891188155385185e-06, "loss": 0.4847, "step": 872500 }, { "epoch": 2.4679421941787094, "grad_norm": 1.8715567588806152, "learning_rate": 8.867630097021509e-06, "loss": 0.4838, "step": 873000 }, { "epoch": 2.46935567768053, "grad_norm": 1.8824268579483032, "learning_rate": 8.844072038657832e-06, "loss": 0.4857, "step": 873500 }, { "epoch": 2.470769161182351, "grad_norm": 1.9322140216827393, "learning_rate": 8.820513980294156e-06, "loss": 0.4862, "step": 874000 }, { "epoch": 2.472182644684171, "grad_norm": 1.912814974784851, "learning_rate": 8.79695592193048e-06, "loss": 0.486, "step": 874500 }, { "epoch": 2.4735961281859917, "grad_norm": 1.8136541843414307, "learning_rate": 8.773397863566803e-06, "loss": 0.4852, "step": 875000 }, { "epoch": 2.4750096116878124, "grad_norm": 2.0755138397216797, "learning_rate": 8.749839805203127e-06, "loss": 0.4834, "step": 875500 }, { "epoch": 2.476423095189633, "grad_norm": 1.7509348392486572, "learning_rate": 8.726281746839452e-06, "loss": 0.4856, "step": 876000 }, { "epoch": 2.4778365786914534, "grad_norm": 1.7744121551513672, "learning_rate": 8.702723688475776e-06, "loss": 0.4874, "step": 876500 }, { "epoch": 2.479250062193274, "grad_norm": 1.9199438095092773, "learning_rate": 8.6791656301121e-06, "loss": 0.4895, "step": 877000 }, { "epoch": 2.4806635456950947, "grad_norm": 1.9941325187683105, "learning_rate": 8.655607571748423e-06, "loss": 0.4899, "step": 877500 }, { "epoch": 2.4820770291969154, "grad_norm": 1.7926734685897827, "learning_rate": 8.632049513384747e-06, "loss": 0.4843, "step": 878000 }, { "epoch": 2.4834905126987357, "grad_norm": 2.1546454429626465, "learning_rate": 8.60849145502107e-06, "loss": 0.4868, "step": 878500 }, { "epoch": 2.4849039962005564, "grad_norm": 1.8138697147369385, "learning_rate": 8.584933396657394e-06, "loss": 0.4859, "step": 879000 }, { "epoch": 2.486317479702377, "grad_norm": 1.8342360258102417, "learning_rate": 8.561375338293718e-06, "loss": 0.4827, "step": 879500 }, { "epoch": 2.4877309632041973, "grad_norm": 1.7119163274765015, "learning_rate": 8.537817279930042e-06, "loss": 0.4844, "step": 880000 }, { "epoch": 2.489144446706018, "grad_norm": 2.01804780960083, "learning_rate": 8.514259221566365e-06, "loss": 0.4864, "step": 880500 }, { "epoch": 2.4905579302078387, "grad_norm": 1.8369022607803345, "learning_rate": 8.490701163202689e-06, "loss": 0.4844, "step": 881000 }, { "epoch": 2.4919714137096594, "grad_norm": 2.0018656253814697, "learning_rate": 8.467143104839013e-06, "loss": 0.4873, "step": 881500 }, { "epoch": 2.4933848972114796, "grad_norm": 1.9026923179626465, "learning_rate": 8.443585046475338e-06, "loss": 0.4893, "step": 882000 }, { "epoch": 2.4947983807133003, "grad_norm": 1.971904993057251, "learning_rate": 8.420026988111662e-06, "loss": 0.4851, "step": 882500 }, { "epoch": 2.496211864215121, "grad_norm": 2.163181781768799, "learning_rate": 8.396468929747985e-06, "loss": 0.4877, "step": 883000 }, { "epoch": 2.4976253477169417, "grad_norm": 1.7946557998657227, "learning_rate": 8.372910871384309e-06, "loss": 0.4832, "step": 883500 }, { "epoch": 2.499038831218762, "grad_norm": 1.8066563606262207, "learning_rate": 8.349352813020633e-06, "loss": 0.4869, "step": 884000 }, { "epoch": 2.5004523147205826, "grad_norm": 2.0130603313446045, "learning_rate": 8.325794754656956e-06, "loss": 0.488, "step": 884500 }, { "epoch": 2.5018657982224033, "grad_norm": 1.7461382150650024, "learning_rate": 8.30223669629328e-06, "loss": 0.4805, "step": 885000 }, { "epoch": 2.5032792817242235, "grad_norm": 2.0204367637634277, "learning_rate": 8.278678637929605e-06, "loss": 0.4865, "step": 885500 }, { "epoch": 2.5046927652260442, "grad_norm": 1.8712173700332642, "learning_rate": 8.255120579565929e-06, "loss": 0.4863, "step": 886000 }, { "epoch": 2.506106248727865, "grad_norm": 1.9548075199127197, "learning_rate": 8.231562521202253e-06, "loss": 0.4867, "step": 886500 }, { "epoch": 2.507519732229685, "grad_norm": 1.9050769805908203, "learning_rate": 8.208004462838576e-06, "loss": 0.4884, "step": 887000 }, { "epoch": 2.508933215731506, "grad_norm": 1.8725768327713013, "learning_rate": 8.1844464044749e-06, "loss": 0.4861, "step": 887500 }, { "epoch": 2.5103466992333265, "grad_norm": 1.9408661127090454, "learning_rate": 8.160888346111224e-06, "loss": 0.4895, "step": 888000 }, { "epoch": 2.5117601827351472, "grad_norm": 2.054871082305908, "learning_rate": 8.137330287747548e-06, "loss": 0.4812, "step": 888500 }, { "epoch": 2.513173666236968, "grad_norm": 2.114215612411499, "learning_rate": 8.113772229383873e-06, "loss": 0.4831, "step": 889000 }, { "epoch": 2.514587149738788, "grad_norm": 2.145958185195923, "learning_rate": 8.090214171020197e-06, "loss": 0.4854, "step": 889500 }, { "epoch": 2.516000633240609, "grad_norm": 2.2729785442352295, "learning_rate": 8.06665611265652e-06, "loss": 0.4878, "step": 890000 }, { "epoch": 2.5174141167424295, "grad_norm": 2.0083484649658203, "learning_rate": 8.043098054292844e-06, "loss": 0.4869, "step": 890500 }, { "epoch": 2.51882760024425, "grad_norm": 1.8835361003875732, "learning_rate": 8.019539995929168e-06, "loss": 0.4844, "step": 891000 }, { "epoch": 2.5202410837460705, "grad_norm": 1.8772711753845215, "learning_rate": 7.995981937565491e-06, "loss": 0.4863, "step": 891500 }, { "epoch": 2.521654567247891, "grad_norm": 1.7579118013381958, "learning_rate": 7.972423879201815e-06, "loss": 0.4856, "step": 892000 }, { "epoch": 2.5230680507497114, "grad_norm": 1.9290488958358765, "learning_rate": 7.948865820838139e-06, "loss": 0.486, "step": 892500 }, { "epoch": 2.524481534251532, "grad_norm": 2.048842668533325, "learning_rate": 7.925307762474464e-06, "loss": 0.4847, "step": 893000 }, { "epoch": 2.525895017753353, "grad_norm": 2.001248836517334, "learning_rate": 7.901749704110788e-06, "loss": 0.4851, "step": 893500 }, { "epoch": 2.5273085012551735, "grad_norm": 1.8432012796401978, "learning_rate": 7.878191645747111e-06, "loss": 0.4877, "step": 894000 }, { "epoch": 2.528721984756994, "grad_norm": 1.853010654449463, "learning_rate": 7.854633587383435e-06, "loss": 0.4843, "step": 894500 }, { "epoch": 2.5301354682588144, "grad_norm": 2.038468837738037, "learning_rate": 7.831075529019759e-06, "loss": 0.4881, "step": 895000 }, { "epoch": 2.531548951760635, "grad_norm": 1.8302112817764282, "learning_rate": 7.807517470656082e-06, "loss": 0.4812, "step": 895500 }, { "epoch": 2.532962435262456, "grad_norm": 1.9210022687911987, "learning_rate": 7.783959412292406e-06, "loss": 0.4868, "step": 896000 }, { "epoch": 2.534375918764276, "grad_norm": 2.087862491607666, "learning_rate": 7.760401353928731e-06, "loss": 0.4857, "step": 896500 }, { "epoch": 2.5357894022660967, "grad_norm": 1.9049177169799805, "learning_rate": 7.736843295565055e-06, "loss": 0.4836, "step": 897000 }, { "epoch": 2.5372028857679174, "grad_norm": 1.8297357559204102, "learning_rate": 7.713285237201379e-06, "loss": 0.4828, "step": 897500 }, { "epoch": 2.5386163692697377, "grad_norm": 1.9057568311691284, "learning_rate": 7.689727178837702e-06, "loss": 0.4805, "step": 898000 }, { "epoch": 2.5400298527715583, "grad_norm": 1.784544825553894, "learning_rate": 7.666169120474026e-06, "loss": 0.4863, "step": 898500 }, { "epoch": 2.541443336273379, "grad_norm": 1.9096325635910034, "learning_rate": 7.64261106211035e-06, "loss": 0.4863, "step": 899000 }, { "epoch": 2.5428568197751997, "grad_norm": 1.8546833992004395, "learning_rate": 7.6190530037466734e-06, "loss": 0.4851, "step": 899500 }, { "epoch": 2.5442703032770204, "grad_norm": 1.966564655303955, "learning_rate": 7.595494945382999e-06, "loss": 0.4821, "step": 900000 }, { "epoch": 2.5456837867788407, "grad_norm": 1.9946645498275757, "learning_rate": 7.5719368870193225e-06, "loss": 0.4822, "step": 900500 }, { "epoch": 2.5470972702806614, "grad_norm": 1.9511921405792236, "learning_rate": 7.548378828655646e-06, "loss": 0.4829, "step": 901000 }, { "epoch": 2.548510753782482, "grad_norm": 1.9894767999649048, "learning_rate": 7.52482077029197e-06, "loss": 0.4867, "step": 901500 }, { "epoch": 2.5499242372843023, "grad_norm": 1.8694943189620972, "learning_rate": 7.5012627119282935e-06, "loss": 0.4802, "step": 902000 }, { "epoch": 2.551337720786123, "grad_norm": 1.8647996187210083, "learning_rate": 7.477704653564617e-06, "loss": 0.4812, "step": 902500 }, { "epoch": 2.5527512042879437, "grad_norm": 1.8344969749450684, "learning_rate": 7.454146595200941e-06, "loss": 0.4866, "step": 903000 }, { "epoch": 2.554164687789764, "grad_norm": 2.0349514484405518, "learning_rate": 7.430588536837265e-06, "loss": 0.4793, "step": 903500 }, { "epoch": 2.5555781712915846, "grad_norm": 1.8915636539459229, "learning_rate": 7.407030478473589e-06, "loss": 0.4821, "step": 904000 }, { "epoch": 2.5569916547934053, "grad_norm": 1.884355068206787, "learning_rate": 7.383472420109913e-06, "loss": 0.4832, "step": 904500 }, { "epoch": 2.558405138295226, "grad_norm": 1.8896831274032593, "learning_rate": 7.359914361746237e-06, "loss": 0.4824, "step": 905000 }, { "epoch": 2.5598186217970462, "grad_norm": 1.9848405122756958, "learning_rate": 7.336356303382561e-06, "loss": 0.4803, "step": 905500 }, { "epoch": 2.561232105298867, "grad_norm": 1.9907050132751465, "learning_rate": 7.312798245018885e-06, "loss": 0.4844, "step": 906000 }, { "epoch": 2.5626455888006876, "grad_norm": 1.9281100034713745, "learning_rate": 7.289240186655208e-06, "loss": 0.487, "step": 906500 }, { "epoch": 2.5640590723025083, "grad_norm": 2.2470903396606445, "learning_rate": 7.265682128291532e-06, "loss": 0.4845, "step": 907000 }, { "epoch": 2.5654725558043285, "grad_norm": 1.9744529724121094, "learning_rate": 7.2421240699278565e-06, "loss": 0.4823, "step": 907500 }, { "epoch": 2.5668860393061492, "grad_norm": 1.812247395515442, "learning_rate": 7.21856601156418e-06, "loss": 0.4798, "step": 908000 }, { "epoch": 2.56829952280797, "grad_norm": 1.8257344961166382, "learning_rate": 7.195007953200504e-06, "loss": 0.4845, "step": 908500 }, { "epoch": 2.56971300630979, "grad_norm": 1.8831816911697388, "learning_rate": 7.1714498948368275e-06, "loss": 0.4822, "step": 909000 }, { "epoch": 2.571126489811611, "grad_norm": 2.0315136909484863, "learning_rate": 7.147891836473151e-06, "loss": 0.4828, "step": 909500 }, { "epoch": 2.5725399733134315, "grad_norm": 2.0156962871551514, "learning_rate": 7.124333778109475e-06, "loss": 0.483, "step": 910000 }, { "epoch": 2.573953456815252, "grad_norm": 2.106574773788452, "learning_rate": 7.1007757197457985e-06, "loss": 0.4837, "step": 910500 }, { "epoch": 2.5753669403170725, "grad_norm": 2.117532968521118, "learning_rate": 7.077217661382124e-06, "loss": 0.4831, "step": 911000 }, { "epoch": 2.576780423818893, "grad_norm": 2.0272140502929688, "learning_rate": 7.0536596030184475e-06, "loss": 0.4824, "step": 911500 }, { "epoch": 2.578193907320714, "grad_norm": 1.8715615272521973, "learning_rate": 7.030101544654771e-06, "loss": 0.483, "step": 912000 }, { "epoch": 2.5796073908225345, "grad_norm": 1.775173544883728, "learning_rate": 7.006543486291095e-06, "loss": 0.4838, "step": 912500 }, { "epoch": 2.581020874324355, "grad_norm": 1.8357768058776855, "learning_rate": 6.9829854279274186e-06, "loss": 0.4816, "step": 913000 }, { "epoch": 2.5824343578261755, "grad_norm": 2.1198654174804688, "learning_rate": 6.959427369563742e-06, "loss": 0.4814, "step": 913500 }, { "epoch": 2.583847841327996, "grad_norm": 1.9847099781036377, "learning_rate": 6.935869311200066e-06, "loss": 0.4796, "step": 914000 }, { "epoch": 2.5852613248298164, "grad_norm": 1.850056767463684, "learning_rate": 6.912311252836391e-06, "loss": 0.4833, "step": 914500 }, { "epoch": 2.586674808331637, "grad_norm": 1.984686255455017, "learning_rate": 6.888753194472715e-06, "loss": 0.4812, "step": 915000 }, { "epoch": 2.588088291833458, "grad_norm": 1.9037647247314453, "learning_rate": 6.865195136109039e-06, "loss": 0.4838, "step": 915500 }, { "epoch": 2.589501775335278, "grad_norm": 1.9065098762512207, "learning_rate": 6.841637077745362e-06, "loss": 0.4807, "step": 916000 }, { "epoch": 2.5909152588370987, "grad_norm": 1.6963917016983032, "learning_rate": 6.818079019381686e-06, "loss": 0.4825, "step": 916500 }, { "epoch": 2.5923287423389194, "grad_norm": 1.902042269706726, "learning_rate": 6.79452096101801e-06, "loss": 0.4786, "step": 917000 }, { "epoch": 2.59374222584074, "grad_norm": 1.8548059463500977, "learning_rate": 6.770962902654333e-06, "loss": 0.4844, "step": 917500 }, { "epoch": 2.595155709342561, "grad_norm": 2.276184320449829, "learning_rate": 6.747404844290659e-06, "loss": 0.485, "step": 918000 }, { "epoch": 2.596569192844381, "grad_norm": 1.9523552656173706, "learning_rate": 6.723846785926982e-06, "loss": 0.4822, "step": 918500 }, { "epoch": 2.5979826763462017, "grad_norm": 2.0184359550476074, "learning_rate": 6.700288727563306e-06, "loss": 0.4855, "step": 919000 }, { "epoch": 2.5993961598480224, "grad_norm": 1.6621419191360474, "learning_rate": 6.67673066919963e-06, "loss": 0.476, "step": 919500 }, { "epoch": 2.6008096433498427, "grad_norm": 1.9704070091247559, "learning_rate": 6.653172610835953e-06, "loss": 0.4792, "step": 920000 }, { "epoch": 2.6022231268516633, "grad_norm": 1.9319995641708374, "learning_rate": 6.629614552472277e-06, "loss": 0.4846, "step": 920500 }, { "epoch": 2.603636610353484, "grad_norm": 2.0825893878936768, "learning_rate": 6.606056494108601e-06, "loss": 0.4827, "step": 921000 }, { "epoch": 2.6050500938553043, "grad_norm": 1.8574585914611816, "learning_rate": 6.582498435744924e-06, "loss": 0.4825, "step": 921500 }, { "epoch": 2.606463577357125, "grad_norm": 1.8330122232437134, "learning_rate": 6.55894037738125e-06, "loss": 0.4819, "step": 922000 }, { "epoch": 2.6078770608589457, "grad_norm": 2.029576063156128, "learning_rate": 6.5353823190175735e-06, "loss": 0.482, "step": 922500 }, { "epoch": 2.6092905443607664, "grad_norm": 2.022286891937256, "learning_rate": 6.511824260653897e-06, "loss": 0.4842, "step": 923000 }, { "epoch": 2.610704027862587, "grad_norm": 2.0089266300201416, "learning_rate": 6.488266202290221e-06, "loss": 0.4821, "step": 923500 }, { "epoch": 2.6121175113644073, "grad_norm": 2.069854259490967, "learning_rate": 6.4647081439265445e-06, "loss": 0.4843, "step": 924000 }, { "epoch": 2.613530994866228, "grad_norm": 1.9096248149871826, "learning_rate": 6.441150085562868e-06, "loss": 0.4783, "step": 924500 }, { "epoch": 2.6149444783680487, "grad_norm": 2.043044090270996, "learning_rate": 6.417592027199192e-06, "loss": 0.4821, "step": 925000 }, { "epoch": 2.616357961869869, "grad_norm": 1.8175281286239624, "learning_rate": 6.394033968835516e-06, "loss": 0.4811, "step": 925500 }, { "epoch": 2.6177714453716896, "grad_norm": 2.375560998916626, "learning_rate": 6.37047591047184e-06, "loss": 0.4813, "step": 926000 }, { "epoch": 2.6191849288735103, "grad_norm": 1.9718605279922485, "learning_rate": 6.346917852108164e-06, "loss": 0.4801, "step": 926500 }, { "epoch": 2.6205984123753305, "grad_norm": 2.0075249671936035, "learning_rate": 6.323359793744487e-06, "loss": 0.48, "step": 927000 }, { "epoch": 2.622011895877151, "grad_norm": 1.878230094909668, "learning_rate": 6.299801735380811e-06, "loss": 0.4825, "step": 927500 }, { "epoch": 2.623425379378972, "grad_norm": 1.8463152647018433, "learning_rate": 6.276243677017135e-06, "loss": 0.4782, "step": 928000 }, { "epoch": 2.6248388628807926, "grad_norm": 1.8635910749435425, "learning_rate": 6.252685618653458e-06, "loss": 0.4785, "step": 928500 }, { "epoch": 2.6262523463826133, "grad_norm": 1.8148760795593262, "learning_rate": 6.229127560289783e-06, "loss": 0.484, "step": 929000 }, { "epoch": 2.6276658298844335, "grad_norm": 1.9863587617874146, "learning_rate": 6.205569501926107e-06, "loss": 0.4809, "step": 929500 }, { "epoch": 2.6290793133862542, "grad_norm": 1.8167134523391724, "learning_rate": 6.182011443562431e-06, "loss": 0.4818, "step": 930000 }, { "epoch": 2.630492796888075, "grad_norm": 1.8909714221954346, "learning_rate": 6.158453385198755e-06, "loss": 0.4788, "step": 930500 }, { "epoch": 2.631906280389895, "grad_norm": 1.8027583360671997, "learning_rate": 6.1348953268350785e-06, "loss": 0.4817, "step": 931000 }, { "epoch": 2.633319763891716, "grad_norm": 1.9182924032211304, "learning_rate": 6.111337268471402e-06, "loss": 0.4792, "step": 931500 }, { "epoch": 2.6347332473935365, "grad_norm": 1.9994778633117676, "learning_rate": 6.087779210107727e-06, "loss": 0.4818, "step": 932000 }, { "epoch": 2.636146730895357, "grad_norm": 1.8693245649337769, "learning_rate": 6.06422115174405e-06, "loss": 0.4821, "step": 932500 }, { "epoch": 2.6375602143971775, "grad_norm": 1.8640729188919067, "learning_rate": 6.040663093380374e-06, "loss": 0.482, "step": 933000 }, { "epoch": 2.638973697898998, "grad_norm": 1.7601839303970337, "learning_rate": 6.0171050350166985e-06, "loss": 0.4782, "step": 933500 }, { "epoch": 2.640387181400819, "grad_norm": 1.7608891725540161, "learning_rate": 5.993546976653022e-06, "loss": 0.4836, "step": 934000 }, { "epoch": 2.641800664902639, "grad_norm": 1.9892377853393555, "learning_rate": 5.969988918289346e-06, "loss": 0.4814, "step": 934500 }, { "epoch": 2.64321414840446, "grad_norm": 1.9059171676635742, "learning_rate": 5.9464308599256695e-06, "loss": 0.4785, "step": 935000 }, { "epoch": 2.6446276319062805, "grad_norm": 1.9803173542022705, "learning_rate": 5.922872801561994e-06, "loss": 0.4778, "step": 935500 }, { "epoch": 2.646041115408101, "grad_norm": 1.7821260690689087, "learning_rate": 5.899314743198318e-06, "loss": 0.481, "step": 936000 }, { "epoch": 2.6474545989099214, "grad_norm": 1.7640492916107178, "learning_rate": 5.875756684834641e-06, "loss": 0.4827, "step": 936500 }, { "epoch": 2.648868082411742, "grad_norm": 2.1856091022491455, "learning_rate": 5.852198626470966e-06, "loss": 0.4815, "step": 937000 }, { "epoch": 2.650281565913563, "grad_norm": 1.7605998516082764, "learning_rate": 5.82864056810729e-06, "loss": 0.4781, "step": 937500 }, { "epoch": 2.651695049415383, "grad_norm": 1.9266194105148315, "learning_rate": 5.805082509743613e-06, "loss": 0.4827, "step": 938000 }, { "epoch": 2.6531085329172037, "grad_norm": 1.9063451290130615, "learning_rate": 5.781524451379937e-06, "loss": 0.477, "step": 938500 }, { "epoch": 2.6545220164190244, "grad_norm": 1.8465832471847534, "learning_rate": 5.7579663930162615e-06, "loss": 0.4796, "step": 939000 }, { "epoch": 2.6559354999208447, "grad_norm": 1.9928085803985596, "learning_rate": 5.734408334652585e-06, "loss": 0.4809, "step": 939500 }, { "epoch": 2.6573489834226653, "grad_norm": 1.850600242614746, "learning_rate": 5.710850276288909e-06, "loss": 0.4839, "step": 940000 }, { "epoch": 2.658762466924486, "grad_norm": 1.9353916645050049, "learning_rate": 5.6872922179252325e-06, "loss": 0.4793, "step": 940500 }, { "epoch": 2.6601759504263067, "grad_norm": 1.7479346990585327, "learning_rate": 5.663734159561557e-06, "loss": 0.4777, "step": 941000 }, { "epoch": 2.6615894339281274, "grad_norm": 1.9903581142425537, "learning_rate": 5.640176101197881e-06, "loss": 0.4809, "step": 941500 }, { "epoch": 2.6630029174299477, "grad_norm": 2.0518722534179688, "learning_rate": 5.616618042834204e-06, "loss": 0.4764, "step": 942000 }, { "epoch": 2.6644164009317683, "grad_norm": 1.865896224975586, "learning_rate": 5.593059984470529e-06, "loss": 0.4785, "step": 942500 }, { "epoch": 2.665829884433589, "grad_norm": 1.814146876335144, "learning_rate": 5.5695019261068526e-06, "loss": 0.4806, "step": 943000 }, { "epoch": 2.6672433679354093, "grad_norm": 1.9471062421798706, "learning_rate": 5.545943867743176e-06, "loss": 0.4769, "step": 943500 }, { "epoch": 2.66865685143723, "grad_norm": 1.6906211376190186, "learning_rate": 5.5223858093795e-06, "loss": 0.4778, "step": 944000 }, { "epoch": 2.6700703349390507, "grad_norm": 1.8644707202911377, "learning_rate": 5.498827751015824e-06, "loss": 0.4788, "step": 944500 }, { "epoch": 2.671483818440871, "grad_norm": 1.9732451438903809, "learning_rate": 5.475269692652147e-06, "loss": 0.4785, "step": 945000 }, { "epoch": 2.6728973019426916, "grad_norm": 2.0173141956329346, "learning_rate": 5.451711634288472e-06, "loss": 0.481, "step": 945500 }, { "epoch": 2.6743107854445123, "grad_norm": 1.8937244415283203, "learning_rate": 5.4281535759247955e-06, "loss": 0.4774, "step": 946000 }, { "epoch": 2.675724268946333, "grad_norm": 1.9488162994384766, "learning_rate": 5.404595517561119e-06, "loss": 0.4788, "step": 946500 }, { "epoch": 2.6771377524481537, "grad_norm": 1.6848728656768799, "learning_rate": 5.381037459197443e-06, "loss": 0.476, "step": 947000 }, { "epoch": 2.678551235949974, "grad_norm": 1.8535088300704956, "learning_rate": 5.3574794008337665e-06, "loss": 0.4814, "step": 947500 }, { "epoch": 2.6799647194517946, "grad_norm": 1.9320138692855835, "learning_rate": 5.333921342470091e-06, "loss": 0.4741, "step": 948000 }, { "epoch": 2.6813782029536153, "grad_norm": 1.7542088031768799, "learning_rate": 5.310363284106415e-06, "loss": 0.4788, "step": 948500 }, { "epoch": 2.6827916864554355, "grad_norm": 2.1810462474823, "learning_rate": 5.286805225742738e-06, "loss": 0.4774, "step": 949000 }, { "epoch": 2.684205169957256, "grad_norm": 2.1277527809143066, "learning_rate": 5.263247167379062e-06, "loss": 0.4805, "step": 949500 }, { "epoch": 2.685618653459077, "grad_norm": 1.7599092721939087, "learning_rate": 5.2396891090153865e-06, "loss": 0.4842, "step": 950000 }, { "epoch": 2.687032136960897, "grad_norm": 1.9173028469085693, "learning_rate": 5.21613105065171e-06, "loss": 0.4792, "step": 950500 }, { "epoch": 2.688445620462718, "grad_norm": 1.896518349647522, "learning_rate": 5.192572992288034e-06, "loss": 0.4799, "step": 951000 }, { "epoch": 2.6898591039645385, "grad_norm": 1.9007881879806519, "learning_rate": 5.169014933924358e-06, "loss": 0.4813, "step": 951500 }, { "epoch": 2.6912725874663592, "grad_norm": 1.9784225225448608, "learning_rate": 5.145456875560682e-06, "loss": 0.4776, "step": 952000 }, { "epoch": 2.69268607096818, "grad_norm": 1.9315286874771118, "learning_rate": 5.121898817197006e-06, "loss": 0.4797, "step": 952500 }, { "epoch": 2.69409955447, "grad_norm": 1.756608247756958, "learning_rate": 5.0983407588333294e-06, "loss": 0.4745, "step": 953000 }, { "epoch": 2.695513037971821, "grad_norm": 1.9761390686035156, "learning_rate": 5.074782700469654e-06, "loss": 0.4837, "step": 953500 }, { "epoch": 2.6969265214736415, "grad_norm": 1.8754701614379883, "learning_rate": 5.051224642105978e-06, "loss": 0.4792, "step": 954000 }, { "epoch": 2.698340004975462, "grad_norm": 1.8909083604812622, "learning_rate": 5.027666583742301e-06, "loss": 0.4772, "step": 954500 }, { "epoch": 2.6997534884772825, "grad_norm": 1.8627945184707642, "learning_rate": 5.004108525378625e-06, "loss": 0.4732, "step": 955000 }, { "epoch": 2.701166971979103, "grad_norm": 1.9277786016464233, "learning_rate": 4.9805504670149495e-06, "loss": 0.4777, "step": 955500 }, { "epoch": 2.7025804554809234, "grad_norm": 1.9882420301437378, "learning_rate": 4.956992408651273e-06, "loss": 0.4823, "step": 956000 }, { "epoch": 2.703993938982744, "grad_norm": 1.9477229118347168, "learning_rate": 4.933434350287597e-06, "loss": 0.4754, "step": 956500 }, { "epoch": 2.705407422484565, "grad_norm": 1.8022792339324951, "learning_rate": 4.909876291923921e-06, "loss": 0.4836, "step": 957000 }, { "epoch": 2.7068209059863855, "grad_norm": 1.9644250869750977, "learning_rate": 4.886318233560245e-06, "loss": 0.4795, "step": 957500 }, { "epoch": 2.708234389488206, "grad_norm": 2.139726400375366, "learning_rate": 4.862760175196569e-06, "loss": 0.4792, "step": 958000 }, { "epoch": 2.7096478729900264, "grad_norm": 2.033918619155884, "learning_rate": 4.839202116832892e-06, "loss": 0.4742, "step": 958500 }, { "epoch": 2.711061356491847, "grad_norm": 2.2704405784606934, "learning_rate": 4.815644058469217e-06, "loss": 0.4806, "step": 959000 }, { "epoch": 2.712474839993668, "grad_norm": 1.915846347808838, "learning_rate": 4.792086000105541e-06, "loss": 0.4802, "step": 959500 }, { "epoch": 2.713888323495488, "grad_norm": 2.1745569705963135, "learning_rate": 4.768527941741864e-06, "loss": 0.4759, "step": 960000 }, { "epoch": 2.7153018069973087, "grad_norm": 2.1320128440856934, "learning_rate": 4.744969883378188e-06, "loss": 0.4798, "step": 960500 }, { "epoch": 2.7167152904991294, "grad_norm": 1.7251592874526978, "learning_rate": 4.7214118250145125e-06, "loss": 0.4776, "step": 961000 }, { "epoch": 2.7181287740009497, "grad_norm": 1.9627765417099, "learning_rate": 4.697853766650836e-06, "loss": 0.4766, "step": 961500 }, { "epoch": 2.7195422575027703, "grad_norm": 1.7912107706069946, "learning_rate": 4.67429570828716e-06, "loss": 0.4788, "step": 962000 }, { "epoch": 2.720955741004591, "grad_norm": 1.8440243005752563, "learning_rate": 4.650737649923484e-06, "loss": 0.4772, "step": 962500 }, { "epoch": 2.7223692245064117, "grad_norm": 1.9037599563598633, "learning_rate": 4.627179591559808e-06, "loss": 0.4787, "step": 963000 }, { "epoch": 2.723782708008232, "grad_norm": 1.8607136011123657, "learning_rate": 4.603621533196132e-06, "loss": 0.4741, "step": 963500 }, { "epoch": 2.7251961915100527, "grad_norm": 1.6884994506835938, "learning_rate": 4.580063474832455e-06, "loss": 0.4768, "step": 964000 }, { "epoch": 2.7266096750118733, "grad_norm": 1.8917460441589355, "learning_rate": 4.556505416468779e-06, "loss": 0.4766, "step": 964500 }, { "epoch": 2.728023158513694, "grad_norm": 1.861642599105835, "learning_rate": 4.532947358105103e-06, "loss": 0.4804, "step": 965000 }, { "epoch": 2.7294366420155143, "grad_norm": 1.982200026512146, "learning_rate": 4.509389299741426e-06, "loss": 0.4765, "step": 965500 }, { "epoch": 2.730850125517335, "grad_norm": 1.8984880447387695, "learning_rate": 4.485831241377751e-06, "loss": 0.4739, "step": 966000 }, { "epoch": 2.7322636090191557, "grad_norm": 1.9732983112335205, "learning_rate": 4.4622731830140746e-06, "loss": 0.4762, "step": 966500 }, { "epoch": 2.733677092520976, "grad_norm": 1.7589553594589233, "learning_rate": 4.438715124650398e-06, "loss": 0.4757, "step": 967000 }, { "epoch": 2.7350905760227966, "grad_norm": 1.9162185192108154, "learning_rate": 4.415157066286722e-06, "loss": 0.4754, "step": 967500 }, { "epoch": 2.7365040595246173, "grad_norm": 1.9831218719482422, "learning_rate": 4.3915990079230464e-06, "loss": 0.4751, "step": 968000 }, { "epoch": 2.7379175430264375, "grad_norm": 1.9715999364852905, "learning_rate": 4.36804094955937e-06, "loss": 0.4767, "step": 968500 }, { "epoch": 2.739331026528258, "grad_norm": 1.9930860996246338, "learning_rate": 4.344482891195694e-06, "loss": 0.474, "step": 969000 }, { "epoch": 2.740744510030079, "grad_norm": 1.7594940662384033, "learning_rate": 4.3209248328320175e-06, "loss": 0.4797, "step": 969500 }, { "epoch": 2.7421579935318996, "grad_norm": 2.0231380462646484, "learning_rate": 4.297366774468342e-06, "loss": 0.4773, "step": 970000 }, { "epoch": 2.7435714770337203, "grad_norm": 1.895690679550171, "learning_rate": 4.273808716104666e-06, "loss": 0.4775, "step": 970500 }, { "epoch": 2.7449849605355405, "grad_norm": 1.7399858236312866, "learning_rate": 4.250250657740989e-06, "loss": 0.48, "step": 971000 }, { "epoch": 2.746398444037361, "grad_norm": 1.9189199209213257, "learning_rate": 4.226692599377314e-06, "loss": 0.4811, "step": 971500 }, { "epoch": 2.747811927539182, "grad_norm": 1.9197585582733154, "learning_rate": 4.2031345410136375e-06, "loss": 0.4778, "step": 972000 }, { "epoch": 2.749225411041002, "grad_norm": 1.908904790878296, "learning_rate": 4.179576482649961e-06, "loss": 0.4834, "step": 972500 }, { "epoch": 2.750638894542823, "grad_norm": 1.9503720998764038, "learning_rate": 4.156018424286285e-06, "loss": 0.4751, "step": 973000 }, { "epoch": 2.7520523780446435, "grad_norm": 1.8846056461334229, "learning_rate": 4.132460365922609e-06, "loss": 0.4743, "step": 973500 }, { "epoch": 2.7534658615464638, "grad_norm": 1.8622273206710815, "learning_rate": 4.108902307558933e-06, "loss": 0.4754, "step": 974000 }, { "epoch": 2.7548793450482845, "grad_norm": 1.88423490524292, "learning_rate": 4.085344249195257e-06, "loss": 0.479, "step": 974500 }, { "epoch": 2.756292828550105, "grad_norm": 1.9103281497955322, "learning_rate": 4.06178619083158e-06, "loss": 0.4767, "step": 975000 }, { "epoch": 2.757706312051926, "grad_norm": 1.8493406772613525, "learning_rate": 4.038228132467905e-06, "loss": 0.4761, "step": 975500 }, { "epoch": 2.7591197955537465, "grad_norm": 1.7773460149765015, "learning_rate": 4.014670074104229e-06, "loss": 0.4768, "step": 976000 }, { "epoch": 2.760533279055567, "grad_norm": 1.9112446308135986, "learning_rate": 3.991112015740552e-06, "loss": 0.4728, "step": 976500 }, { "epoch": 2.7619467625573875, "grad_norm": 1.7441248893737793, "learning_rate": 3.967553957376877e-06, "loss": 0.4769, "step": 977000 }, { "epoch": 2.763360246059208, "grad_norm": 2.1637072563171387, "learning_rate": 3.9439958990132005e-06, "loss": 0.477, "step": 977500 }, { "epoch": 2.7647737295610284, "grad_norm": 2.1812381744384766, "learning_rate": 3.920437840649524e-06, "loss": 0.4772, "step": 978000 }, { "epoch": 2.766187213062849, "grad_norm": 1.7566249370574951, "learning_rate": 3.896879782285848e-06, "loss": 0.4746, "step": 978500 }, { "epoch": 2.76760069656467, "grad_norm": 1.9075323343276978, "learning_rate": 3.873321723922172e-06, "loss": 0.4812, "step": 979000 }, { "epoch": 2.76901418006649, "grad_norm": 1.941957950592041, "learning_rate": 3.849763665558496e-06, "loss": 0.4774, "step": 979500 }, { "epoch": 2.7704276635683107, "grad_norm": 1.8558536767959595, "learning_rate": 3.82620560719482e-06, "loss": 0.4803, "step": 980000 }, { "epoch": 2.7718411470701314, "grad_norm": 1.837294101715088, "learning_rate": 3.802647548831144e-06, "loss": 0.4756, "step": 980500 }, { "epoch": 2.773254630571952, "grad_norm": 1.9312266111373901, "learning_rate": 3.7790894904674675e-06, "loss": 0.4783, "step": 981000 }, { "epoch": 2.774668114073773, "grad_norm": 2.0401787757873535, "learning_rate": 3.755531432103791e-06, "loss": 0.4741, "step": 981500 }, { "epoch": 2.776081597575593, "grad_norm": 1.856086015701294, "learning_rate": 3.731973373740115e-06, "loss": 0.4783, "step": 982000 }, { "epoch": 2.7774950810774137, "grad_norm": 2.1311733722686768, "learning_rate": 3.7084153153764393e-06, "loss": 0.473, "step": 982500 }, { "epoch": 2.7789085645792344, "grad_norm": 2.1316375732421875, "learning_rate": 3.684857257012763e-06, "loss": 0.4721, "step": 983000 }, { "epoch": 2.7803220480810547, "grad_norm": 1.9547605514526367, "learning_rate": 3.6612991986490867e-06, "loss": 0.477, "step": 983500 }, { "epoch": 2.7817355315828753, "grad_norm": 1.9322706460952759, "learning_rate": 3.6377411402854104e-06, "loss": 0.4766, "step": 984000 }, { "epoch": 2.783149015084696, "grad_norm": 1.8734384775161743, "learning_rate": 3.614183081921735e-06, "loss": 0.4752, "step": 984500 }, { "epoch": 2.7845624985865163, "grad_norm": 1.8073605298995972, "learning_rate": 3.5906250235580586e-06, "loss": 0.4755, "step": 985000 }, { "epoch": 2.785975982088337, "grad_norm": 1.9522842168807983, "learning_rate": 3.5670669651943822e-06, "loss": 0.4744, "step": 985500 }, { "epoch": 2.7873894655901577, "grad_norm": 1.6247305870056152, "learning_rate": 3.5435089068307068e-06, "loss": 0.4763, "step": 986000 }, { "epoch": 2.7888029490919783, "grad_norm": 1.8694617748260498, "learning_rate": 3.5199508484670304e-06, "loss": 0.4742, "step": 986500 }, { "epoch": 2.790216432593799, "grad_norm": 1.949827790260315, "learning_rate": 3.496392790103354e-06, "loss": 0.4742, "step": 987000 }, { "epoch": 2.7916299160956193, "grad_norm": 1.8632761240005493, "learning_rate": 3.4728347317396778e-06, "loss": 0.476, "step": 987500 }, { "epoch": 2.79304339959744, "grad_norm": 1.8900924921035767, "learning_rate": 3.449276673376002e-06, "loss": 0.4744, "step": 988000 }, { "epoch": 2.7944568830992607, "grad_norm": 1.8881003856658936, "learning_rate": 3.425718615012326e-06, "loss": 0.4792, "step": 988500 }, { "epoch": 2.795870366601081, "grad_norm": 1.8138952255249023, "learning_rate": 3.4021605566486496e-06, "loss": 0.475, "step": 989000 }, { "epoch": 2.7972838501029016, "grad_norm": 1.9786615371704102, "learning_rate": 3.3786024982849737e-06, "loss": 0.4744, "step": 989500 }, { "epoch": 2.7986973336047223, "grad_norm": 1.8896914720535278, "learning_rate": 3.3550444399212974e-06, "loss": 0.474, "step": 990000 }, { "epoch": 2.8001108171065425, "grad_norm": 2.0412654876708984, "learning_rate": 3.331486381557621e-06, "loss": 0.4753, "step": 990500 }, { "epoch": 2.801524300608363, "grad_norm": 1.7360100746154785, "learning_rate": 3.3079283231939448e-06, "loss": 0.4763, "step": 991000 }, { "epoch": 2.802937784110184, "grad_norm": 1.7852778434753418, "learning_rate": 3.2843702648302693e-06, "loss": 0.4763, "step": 991500 }, { "epoch": 2.8043512676120046, "grad_norm": 2.0028610229492188, "learning_rate": 3.260812206466593e-06, "loss": 0.4747, "step": 992000 }, { "epoch": 2.805764751113825, "grad_norm": 1.981197476387024, "learning_rate": 3.2372541481029166e-06, "loss": 0.4721, "step": 992500 }, { "epoch": 2.8071782346156455, "grad_norm": 1.8272744417190552, "learning_rate": 3.2136960897392403e-06, "loss": 0.4736, "step": 993000 }, { "epoch": 2.808591718117466, "grad_norm": 1.9510681629180908, "learning_rate": 3.190138031375565e-06, "loss": 0.4736, "step": 993500 }, { "epoch": 2.810005201619287, "grad_norm": 1.9202264547348022, "learning_rate": 3.1665799730118885e-06, "loss": 0.4821, "step": 994000 }, { "epoch": 2.811418685121107, "grad_norm": 1.850290060043335, "learning_rate": 3.143021914648212e-06, "loss": 0.4799, "step": 994500 }, { "epoch": 2.812832168622928, "grad_norm": 1.9096814393997192, "learning_rate": 3.1194638562845363e-06, "loss": 0.4741, "step": 995000 }, { "epoch": 2.8142456521247485, "grad_norm": 1.8525327444076538, "learning_rate": 3.0959057979208604e-06, "loss": 0.476, "step": 995500 }, { "epoch": 2.8156591356265688, "grad_norm": 1.834235668182373, "learning_rate": 3.072347739557184e-06, "loss": 0.4758, "step": 996000 }, { "epoch": 2.8170726191283895, "grad_norm": 1.7803674936294556, "learning_rate": 3.048789681193508e-06, "loss": 0.4755, "step": 996500 }, { "epoch": 2.81848610263021, "grad_norm": 1.993950605392456, "learning_rate": 3.025231622829832e-06, "loss": 0.4714, "step": 997000 }, { "epoch": 2.8198995861320304, "grad_norm": 1.8939605951309204, "learning_rate": 3.001673564466156e-06, "loss": 0.4755, "step": 997500 }, { "epoch": 2.821313069633851, "grad_norm": 1.9293370246887207, "learning_rate": 2.9781155061024796e-06, "loss": 0.4738, "step": 998000 }, { "epoch": 2.822726553135672, "grad_norm": 2.011793851852417, "learning_rate": 2.9545574477388037e-06, "loss": 0.4722, "step": 998500 }, { "epoch": 2.8241400366374925, "grad_norm": 1.9483085870742798, "learning_rate": 2.9309993893751274e-06, "loss": 0.4748, "step": 999000 }, { "epoch": 2.825553520139313, "grad_norm": 1.9889156818389893, "learning_rate": 2.907441331011451e-06, "loss": 0.4765, "step": 999500 }, { "epoch": 2.8269670036411334, "grad_norm": 1.9352165460586548, "learning_rate": 2.883883272647775e-06, "loss": 0.4761, "step": 1000000 }, { "epoch": 2.828380487142954, "grad_norm": 1.8703913688659668, "learning_rate": 2.860325214284099e-06, "loss": 0.4745, "step": 1000500 }, { "epoch": 2.829793970644775, "grad_norm": 1.8343589305877686, "learning_rate": 2.836767155920423e-06, "loss": 0.4729, "step": 1001000 }, { "epoch": 2.831207454146595, "grad_norm": 1.9111852645874023, "learning_rate": 2.8132090975567466e-06, "loss": 0.4734, "step": 1001500 }, { "epoch": 2.8326209376484157, "grad_norm": 2.024245023727417, "learning_rate": 2.7896510391930707e-06, "loss": 0.4787, "step": 1002000 }, { "epoch": 2.8340344211502364, "grad_norm": 1.9577772617340088, "learning_rate": 2.7660929808293944e-06, "loss": 0.4741, "step": 1002500 }, { "epoch": 2.8354479046520566, "grad_norm": 2.0325756072998047, "learning_rate": 2.7425349224657185e-06, "loss": 0.4741, "step": 1003000 }, { "epoch": 2.8368613881538773, "grad_norm": 2.0980920791625977, "learning_rate": 2.718976864102042e-06, "loss": 0.473, "step": 1003500 }, { "epoch": 2.838274871655698, "grad_norm": 1.8730610609054565, "learning_rate": 2.6954188057383662e-06, "loss": 0.4773, "step": 1004000 }, { "epoch": 2.8396883551575187, "grad_norm": 2.114408254623413, "learning_rate": 2.6718607473746903e-06, "loss": 0.4741, "step": 1004500 }, { "epoch": 2.8411018386593394, "grad_norm": 1.7395135164260864, "learning_rate": 2.648302689011014e-06, "loss": 0.4715, "step": 1005000 }, { "epoch": 2.8425153221611597, "grad_norm": 1.9461650848388672, "learning_rate": 2.624744630647338e-06, "loss": 0.4723, "step": 1005500 }, { "epoch": 2.8439288056629803, "grad_norm": 1.9509810209274292, "learning_rate": 2.6011865722836618e-06, "loss": 0.4732, "step": 1006000 }, { "epoch": 2.845342289164801, "grad_norm": 1.8294713497161865, "learning_rate": 2.577628513919986e-06, "loss": 0.479, "step": 1006500 }, { "epoch": 2.8467557726666213, "grad_norm": 1.91237473487854, "learning_rate": 2.5540704555563095e-06, "loss": 0.4752, "step": 1007000 }, { "epoch": 2.848169256168442, "grad_norm": 1.8048672676086426, "learning_rate": 2.5305123971926336e-06, "loss": 0.4733, "step": 1007500 }, { "epoch": 2.8495827396702627, "grad_norm": 2.0593883991241455, "learning_rate": 2.5069543388289573e-06, "loss": 0.474, "step": 1008000 }, { "epoch": 2.850996223172083, "grad_norm": 2.050459146499634, "learning_rate": 2.483396280465281e-06, "loss": 0.4696, "step": 1008500 }, { "epoch": 2.8524097066739036, "grad_norm": 1.9699594974517822, "learning_rate": 2.459838222101605e-06, "loss": 0.4757, "step": 1009000 }, { "epoch": 2.8538231901757243, "grad_norm": 1.6988970041275024, "learning_rate": 2.4362801637379288e-06, "loss": 0.4764, "step": 1009500 }, { "epoch": 2.855236673677545, "grad_norm": 1.9784103631973267, "learning_rate": 2.412722105374253e-06, "loss": 0.4748, "step": 1010000 }, { "epoch": 2.8566501571793657, "grad_norm": 1.8509936332702637, "learning_rate": 2.3891640470105765e-06, "loss": 0.4721, "step": 1010500 }, { "epoch": 2.858063640681186, "grad_norm": 1.9458303451538086, "learning_rate": 2.3656059886469006e-06, "loss": 0.4708, "step": 1011000 }, { "epoch": 2.8594771241830066, "grad_norm": 1.9711313247680664, "learning_rate": 2.3420479302832243e-06, "loss": 0.4706, "step": 1011500 }, { "epoch": 2.8608906076848273, "grad_norm": 1.9398372173309326, "learning_rate": 2.3184898719195484e-06, "loss": 0.4685, "step": 1012000 }, { "epoch": 2.8623040911866475, "grad_norm": 1.883672833442688, "learning_rate": 2.294931813555872e-06, "loss": 0.473, "step": 1012500 }, { "epoch": 2.863717574688468, "grad_norm": 2.1971781253814697, "learning_rate": 2.271373755192196e-06, "loss": 0.4695, "step": 1013000 }, { "epoch": 2.865131058190289, "grad_norm": 1.8358629941940308, "learning_rate": 2.2478156968285203e-06, "loss": 0.4715, "step": 1013500 }, { "epoch": 2.866544541692109, "grad_norm": 2.0145015716552734, "learning_rate": 2.224257638464844e-06, "loss": 0.4712, "step": 1014000 }, { "epoch": 2.86795802519393, "grad_norm": 1.8847311735153198, "learning_rate": 2.200699580101168e-06, "loss": 0.4734, "step": 1014500 }, { "epoch": 2.8693715086957505, "grad_norm": 1.9816179275512695, "learning_rate": 2.1771415217374917e-06, "loss": 0.4738, "step": 1015000 }, { "epoch": 2.870784992197571, "grad_norm": 1.653665542602539, "learning_rate": 2.153583463373816e-06, "loss": 0.471, "step": 1015500 }, { "epoch": 2.872198475699392, "grad_norm": 2.0703837871551514, "learning_rate": 2.1300254050101395e-06, "loss": 0.4709, "step": 1016000 }, { "epoch": 2.873611959201212, "grad_norm": 1.6813734769821167, "learning_rate": 2.1064673466464636e-06, "loss": 0.4738, "step": 1016500 }, { "epoch": 2.875025442703033, "grad_norm": 1.7487956285476685, "learning_rate": 2.0829092882827873e-06, "loss": 0.4742, "step": 1017000 }, { "epoch": 2.8764389262048535, "grad_norm": 1.8546046018600464, "learning_rate": 2.0593512299191114e-06, "loss": 0.4722, "step": 1017500 }, { "epoch": 2.8778524097066738, "grad_norm": 1.9214463233947754, "learning_rate": 2.035793171555435e-06, "loss": 0.4753, "step": 1018000 }, { "epoch": 2.8792658932084945, "grad_norm": 1.774422526359558, "learning_rate": 2.0122351131917587e-06, "loss": 0.4725, "step": 1018500 }, { "epoch": 2.880679376710315, "grad_norm": 1.785498857498169, "learning_rate": 1.988677054828083e-06, "loss": 0.4746, "step": 1019000 }, { "epoch": 2.8820928602121354, "grad_norm": 2.0067570209503174, "learning_rate": 1.9651189964644065e-06, "loss": 0.4747, "step": 1019500 }, { "epoch": 2.883506343713956, "grad_norm": 1.6696093082427979, "learning_rate": 1.9415609381007306e-06, "loss": 0.4757, "step": 1020000 }, { "epoch": 2.8849198272157768, "grad_norm": 1.8877017498016357, "learning_rate": 1.9180028797370543e-06, "loss": 0.4704, "step": 1020500 }, { "epoch": 2.8863333107175975, "grad_norm": 2.1326797008514404, "learning_rate": 1.8944448213733784e-06, "loss": 0.4767, "step": 1021000 }, { "epoch": 2.8877467942194177, "grad_norm": 2.116431951522827, "learning_rate": 1.870886763009702e-06, "loss": 0.4698, "step": 1021500 }, { "epoch": 2.8891602777212384, "grad_norm": 2.04670786857605, "learning_rate": 1.8473287046460261e-06, "loss": 0.4752, "step": 1022000 }, { "epoch": 2.890573761223059, "grad_norm": 2.051900625228882, "learning_rate": 1.8237706462823498e-06, "loss": 0.4745, "step": 1022500 }, { "epoch": 2.89198724472488, "grad_norm": 1.9148057699203491, "learning_rate": 1.8002125879186739e-06, "loss": 0.472, "step": 1023000 }, { "epoch": 2.8934007282267, "grad_norm": 1.8630082607269287, "learning_rate": 1.776654529554998e-06, "loss": 0.475, "step": 1023500 }, { "epoch": 2.8948142117285207, "grad_norm": 2.101923942565918, "learning_rate": 1.7530964711913217e-06, "loss": 0.4697, "step": 1024000 }, { "epoch": 2.8962276952303414, "grad_norm": 1.6232861280441284, "learning_rate": 1.7295384128276458e-06, "loss": 0.472, "step": 1024500 }, { "epoch": 2.8976411787321616, "grad_norm": 2.178830146789551, "learning_rate": 1.7059803544639694e-06, "loss": 0.4709, "step": 1025000 }, { "epoch": 2.8990546622339823, "grad_norm": 1.7581380605697632, "learning_rate": 1.6824222961002933e-06, "loss": 0.4724, "step": 1025500 }, { "epoch": 2.900468145735803, "grad_norm": 1.8999173641204834, "learning_rate": 1.658864237736617e-06, "loss": 0.4721, "step": 1026000 }, { "epoch": 2.9018816292376233, "grad_norm": 2.1273272037506104, "learning_rate": 1.635306179372941e-06, "loss": 0.4723, "step": 1026500 }, { "epoch": 2.903295112739444, "grad_norm": 2.021346092224121, "learning_rate": 1.6117481210092648e-06, "loss": 0.4754, "step": 1027000 }, { "epoch": 2.9047085962412647, "grad_norm": 2.048670768737793, "learning_rate": 1.5881900626455889e-06, "loss": 0.4727, "step": 1027500 }, { "epoch": 2.9061220797430853, "grad_norm": 1.9126251935958862, "learning_rate": 1.564632004281913e-06, "loss": 0.4704, "step": 1028000 }, { "epoch": 2.907535563244906, "grad_norm": 2.075143337249756, "learning_rate": 1.5410739459182366e-06, "loss": 0.4714, "step": 1028500 }, { "epoch": 2.9089490467467263, "grad_norm": 1.8982406854629517, "learning_rate": 1.5175158875545605e-06, "loss": 0.4716, "step": 1029000 }, { "epoch": 2.910362530248547, "grad_norm": 1.919345498085022, "learning_rate": 1.4939578291908844e-06, "loss": 0.4745, "step": 1029500 }, { "epoch": 2.9117760137503677, "grad_norm": 1.932414174079895, "learning_rate": 1.4703997708272083e-06, "loss": 0.4738, "step": 1030000 }, { "epoch": 2.913189497252188, "grad_norm": 1.8894630670547485, "learning_rate": 1.4468417124635322e-06, "loss": 0.4712, "step": 1030500 }, { "epoch": 2.9146029807540086, "grad_norm": 1.809396743774414, "learning_rate": 1.423283654099856e-06, "loss": 0.4749, "step": 1031000 }, { "epoch": 2.9160164642558293, "grad_norm": 2.016097068786621, "learning_rate": 1.39972559573618e-06, "loss": 0.4698, "step": 1031500 }, { "epoch": 2.9174299477576495, "grad_norm": 2.073176622390747, "learning_rate": 1.3761675373725038e-06, "loss": 0.4692, "step": 1032000 }, { "epoch": 2.91884343125947, "grad_norm": 1.9388052225112915, "learning_rate": 1.3526094790088277e-06, "loss": 0.4724, "step": 1032500 }, { "epoch": 2.920256914761291, "grad_norm": 1.877970576286316, "learning_rate": 1.3290514206451516e-06, "loss": 0.4748, "step": 1033000 }, { "epoch": 2.9216703982631116, "grad_norm": 1.8005867004394531, "learning_rate": 1.3054933622814755e-06, "loss": 0.4722, "step": 1033500 }, { "epoch": 2.9230838817649323, "grad_norm": 1.8535301685333252, "learning_rate": 1.2819353039177994e-06, "loss": 0.4718, "step": 1034000 }, { "epoch": 2.9244973652667525, "grad_norm": 1.9982821941375732, "learning_rate": 1.2583772455541233e-06, "loss": 0.4692, "step": 1034500 }, { "epoch": 2.925910848768573, "grad_norm": 1.9419136047363281, "learning_rate": 1.2348191871904472e-06, "loss": 0.4709, "step": 1035000 }, { "epoch": 2.927324332270394, "grad_norm": 1.8224635124206543, "learning_rate": 1.211261128826771e-06, "loss": 0.4723, "step": 1035500 }, { "epoch": 2.928737815772214, "grad_norm": 1.957592487335205, "learning_rate": 1.187703070463095e-06, "loss": 0.4729, "step": 1036000 }, { "epoch": 2.930151299274035, "grad_norm": 1.8880985975265503, "learning_rate": 1.1641450120994188e-06, "loss": 0.477, "step": 1036500 }, { "epoch": 2.9315647827758555, "grad_norm": 2.004373550415039, "learning_rate": 1.1405869537357427e-06, "loss": 0.4728, "step": 1037000 }, { "epoch": 2.9329782662776758, "grad_norm": 1.701002836227417, "learning_rate": 1.1170288953720666e-06, "loss": 0.4719, "step": 1037500 }, { "epoch": 2.9343917497794965, "grad_norm": 2.0610008239746094, "learning_rate": 1.0934708370083905e-06, "loss": 0.4723, "step": 1038000 }, { "epoch": 2.935805233281317, "grad_norm": 1.9984381198883057, "learning_rate": 1.0699127786447144e-06, "loss": 0.475, "step": 1038500 }, { "epoch": 2.937218716783138, "grad_norm": 2.057466983795166, "learning_rate": 1.0463547202810382e-06, "loss": 0.475, "step": 1039000 }, { "epoch": 2.9386322002849585, "grad_norm": 1.8223023414611816, "learning_rate": 1.0227966619173621e-06, "loss": 0.4708, "step": 1039500 }, { "epoch": 2.9400456837867788, "grad_norm": 1.695405125617981, "learning_rate": 9.99238603553686e-07, "loss": 0.4716, "step": 1040000 }, { "epoch": 2.9414591672885995, "grad_norm": 1.9580937623977661, "learning_rate": 9.7568054519001e-07, "loss": 0.4727, "step": 1040500 }, { "epoch": 2.94287265079042, "grad_norm": 1.91705322265625, "learning_rate": 9.521224868263339e-07, "loss": 0.4745, "step": 1041000 }, { "epoch": 2.9442861342922404, "grad_norm": 1.9498082399368286, "learning_rate": 9.285644284626577e-07, "loss": 0.4695, "step": 1041500 }, { "epoch": 2.945699617794061, "grad_norm": 2.0150201320648193, "learning_rate": 9.050063700989816e-07, "loss": 0.4781, "step": 1042000 }, { "epoch": 2.9471131012958818, "grad_norm": 1.8534653186798096, "learning_rate": 8.814483117353054e-07, "loss": 0.4733, "step": 1042500 }, { "epoch": 2.948526584797702, "grad_norm": 1.8695961236953735, "learning_rate": 8.578902533716293e-07, "loss": 0.4723, "step": 1043000 }, { "epoch": 2.9499400682995227, "grad_norm": 1.780969262123108, "learning_rate": 8.343321950079532e-07, "loss": 0.476, "step": 1043500 }, { "epoch": 2.9513535518013434, "grad_norm": 1.9894623756408691, "learning_rate": 8.107741366442771e-07, "loss": 0.4688, "step": 1044000 }, { "epoch": 2.952767035303164, "grad_norm": 1.902683138847351, "learning_rate": 7.872160782806011e-07, "loss": 0.4733, "step": 1044500 }, { "epoch": 2.954180518804985, "grad_norm": 1.7310363054275513, "learning_rate": 7.636580199169249e-07, "loss": 0.473, "step": 1045000 }, { "epoch": 2.955594002306805, "grad_norm": 1.9313238859176636, "learning_rate": 7.400999615532488e-07, "loss": 0.4718, "step": 1045500 }, { "epoch": 2.9570074858086257, "grad_norm": 1.8468340635299683, "learning_rate": 7.165419031895726e-07, "loss": 0.4739, "step": 1046000 }, { "epoch": 2.9584209693104464, "grad_norm": 1.774349331855774, "learning_rate": 6.929838448258965e-07, "loss": 0.4681, "step": 1046500 }, { "epoch": 2.9598344528122666, "grad_norm": 1.8642592430114746, "learning_rate": 6.694257864622204e-07, "loss": 0.4751, "step": 1047000 }, { "epoch": 2.9612479363140873, "grad_norm": 2.024841070175171, "learning_rate": 6.458677280985443e-07, "loss": 0.4735, "step": 1047500 }, { "epoch": 2.962661419815908, "grad_norm": 2.05379319190979, "learning_rate": 6.223096697348683e-07, "loss": 0.4741, "step": 1048000 }, { "epoch": 2.9640749033177283, "grad_norm": 1.6738313436508179, "learning_rate": 5.987516113711921e-07, "loss": 0.4706, "step": 1048500 }, { "epoch": 2.965488386819549, "grad_norm": 1.935042381286621, "learning_rate": 5.75193553007516e-07, "loss": 0.4722, "step": 1049000 }, { "epoch": 2.9669018703213696, "grad_norm": 1.8947490453720093, "learning_rate": 5.516354946438398e-07, "loss": 0.4691, "step": 1049500 }, { "epoch": 2.9683153538231903, "grad_norm": 2.040578842163086, "learning_rate": 5.280774362801637e-07, "loss": 0.4681, "step": 1050000 }, { "epoch": 2.9697288373250106, "grad_norm": 2.0417420864105225, "learning_rate": 5.045193779164877e-07, "loss": 0.4704, "step": 1050500 }, { "epoch": 2.9711423208268313, "grad_norm": 1.6592463254928589, "learning_rate": 4.809613195528115e-07, "loss": 0.4725, "step": 1051000 }, { "epoch": 2.972555804328652, "grad_norm": 1.9766762256622314, "learning_rate": 4.5740326118913545e-07, "loss": 0.4703, "step": 1051500 }, { "epoch": 2.9739692878304727, "grad_norm": 1.8470534086227417, "learning_rate": 4.338452028254593e-07, "loss": 0.4715, "step": 1052000 }, { "epoch": 2.975382771332293, "grad_norm": 2.0506951808929443, "learning_rate": 4.1028714446178317e-07, "loss": 0.4741, "step": 1052500 }, { "epoch": 2.9767962548341136, "grad_norm": 1.9688812494277954, "learning_rate": 3.8672908609810705e-07, "loss": 0.4724, "step": 1053000 }, { "epoch": 2.9782097383359343, "grad_norm": 1.7794991731643677, "learning_rate": 3.63171027734431e-07, "loss": 0.4723, "step": 1053500 }, { "epoch": 2.9796232218377545, "grad_norm": 2.0863656997680664, "learning_rate": 3.396129693707549e-07, "loss": 0.4712, "step": 1054000 }, { "epoch": 2.981036705339575, "grad_norm": 2.180192232131958, "learning_rate": 3.1605491100707876e-07, "loss": 0.4724, "step": 1054500 }, { "epoch": 2.982450188841396, "grad_norm": 1.8998346328735352, "learning_rate": 2.9249685264340265e-07, "loss": 0.4706, "step": 1055000 }, { "epoch": 2.983863672343216, "grad_norm": 1.7859346866607666, "learning_rate": 2.689387942797265e-07, "loss": 0.4727, "step": 1055500 }, { "epoch": 2.985277155845037, "grad_norm": 1.8241981267929077, "learning_rate": 2.453807359160504e-07, "loss": 0.4685, "step": 1056000 }, { "epoch": 2.9866906393468575, "grad_norm": 1.84385347366333, "learning_rate": 2.2182267755237428e-07, "loss": 0.4691, "step": 1056500 }, { "epoch": 2.988104122848678, "grad_norm": 1.8533270359039307, "learning_rate": 1.982646191886982e-07, "loss": 0.4697, "step": 1057000 }, { "epoch": 2.989517606350499, "grad_norm": 1.9469895362854004, "learning_rate": 1.7470656082502205e-07, "loss": 0.4678, "step": 1057500 }, { "epoch": 2.990931089852319, "grad_norm": 2.0735573768615723, "learning_rate": 1.5114850246134594e-07, "loss": 0.4744, "step": 1058000 }, { "epoch": 2.99234457335414, "grad_norm": 1.8653805255889893, "learning_rate": 1.2759044409766985e-07, "loss": 0.476, "step": 1058500 }, { "epoch": 2.9937580568559605, "grad_norm": 1.9854600429534912, "learning_rate": 1.0403238573399372e-07, "loss": 0.4749, "step": 1059000 }, { "epoch": 2.9951715403577808, "grad_norm": 2.0510377883911133, "learning_rate": 8.047432737031761e-08, "loss": 0.4686, "step": 1059500 }, { "epoch": 2.9965850238596015, "grad_norm": 2.041825294494629, "learning_rate": 5.691626900664149e-08, "loss": 0.4725, "step": 1060000 }, { "epoch": 2.997998507361422, "grad_norm": 1.8965306282043457, "learning_rate": 3.3358210642965373e-08, "loss": 0.4682, "step": 1060500 }, { "epoch": 2.9994119908632424, "grad_norm": 1.8758937120437622, "learning_rate": 9.800152279289264e-09, "loss": 0.4728, "step": 1061000 }, { "epoch": 3.0, "step": 1061208, "total_flos": 8.947355434270848e+18, "train_loss": 0.5702250743976136, "train_runtime": 431071.4094, "train_samples_per_second": 78.777, "train_steps_per_second": 2.462 } ], "logging_steps": 500, "max_steps": 1061208, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.947355434270848e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }