|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.352018816150529, |
|
"eval_steps": 700, |
|
"global_step": 12000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001960015680125441, |
|
"grad_norm": 7.928689002990723, |
|
"learning_rate": 8.333333333333334e-08, |
|
"loss": 1.4388, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.003920031360250882, |
|
"grad_norm": 9.54440689086914, |
|
"learning_rate": 1.6666666666666668e-07, |
|
"loss": 1.4362, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.005880047040376323, |
|
"grad_norm": 9.008377075195312, |
|
"learning_rate": 2.5000000000000004e-07, |
|
"loss": 1.4445, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.007840062720501764, |
|
"grad_norm": 7.416823863983154, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 1.4217, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.009800078400627205, |
|
"grad_norm": 7.977886199951172, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 1.4243, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.011760094080752646, |
|
"grad_norm": 8.24797248840332, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 1.3976, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.013720109760878087, |
|
"grad_norm": 7.240636348724365, |
|
"learning_rate": 5.833333333333334e-07, |
|
"loss": 1.3853, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01568012544100353, |
|
"grad_norm": 5.709108352661133, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 1.3353, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01764014112112897, |
|
"grad_norm": 9.35693073272705, |
|
"learning_rate": 7.5e-07, |
|
"loss": 1.3051, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01960015680125441, |
|
"grad_norm": 47.39125442504883, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 1.265, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02156017248137985, |
|
"grad_norm": 5.6352434158325195, |
|
"learning_rate": 9.166666666666666e-07, |
|
"loss": 1.2583, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.023520188161505293, |
|
"grad_norm": 6.022407531738281, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.2398, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.025480203841630734, |
|
"grad_norm": 6.143012046813965, |
|
"learning_rate": 1.0833333333333335e-06, |
|
"loss": 1.2237, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.027440219521756175, |
|
"grad_norm": 6.173267841339111, |
|
"learning_rate": 1.1666666666666668e-06, |
|
"loss": 1.2246, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.029400235201881616, |
|
"grad_norm": 5.4503092765808105, |
|
"learning_rate": 1.25e-06, |
|
"loss": 1.1993, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03136025088200706, |
|
"grad_norm": 17.596582412719727, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.1889, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.033320266562132494, |
|
"grad_norm": 6.131169319152832, |
|
"learning_rate": 1.4166666666666667e-06, |
|
"loss": 1.1884, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.03528028224225794, |
|
"grad_norm": 5.450586795806885, |
|
"learning_rate": 1.5e-06, |
|
"loss": 1.1622, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03724029792238338, |
|
"grad_norm": 14.713933944702148, |
|
"learning_rate": 1.5833333333333333e-06, |
|
"loss": 1.1675, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03920031360250882, |
|
"grad_norm": 8.510261535644531, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 1.1584, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04116032928263426, |
|
"grad_norm": 6.9118971824646, |
|
"learning_rate": 1.75e-06, |
|
"loss": 1.1696, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0431203449627597, |
|
"grad_norm": 5.986424922943115, |
|
"learning_rate": 1.8333333333333333e-06, |
|
"loss": 1.1734, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04508036064288514, |
|
"grad_norm": 7.179569244384766, |
|
"learning_rate": 1.916666666666667e-06, |
|
"loss": 1.1563, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.047040376323010585, |
|
"grad_norm": 7.15562629699707, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1443, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04900039200313602, |
|
"grad_norm": 5.163909912109375, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 1.1567, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05096040768326147, |
|
"grad_norm": 5.861210346221924, |
|
"learning_rate": 2.166666666666667e-06, |
|
"loss": 1.1596, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.052920423363386905, |
|
"grad_norm": 5.1880292892456055, |
|
"learning_rate": 2.25e-06, |
|
"loss": 1.1428, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05488043904351235, |
|
"grad_norm": 6.203003406524658, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 1.1463, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.05684045472363779, |
|
"grad_norm": 6.191583156585693, |
|
"learning_rate": 2.4166666666666667e-06, |
|
"loss": 1.1538, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.05880047040376323, |
|
"grad_norm": 9.0908203125, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.1277, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06076048608388867, |
|
"grad_norm": 5.685515880584717, |
|
"learning_rate": 2.5833333333333337e-06, |
|
"loss": 1.1471, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.06272050176401411, |
|
"grad_norm": 6.010474681854248, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 1.1232, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06468051744413955, |
|
"grad_norm": 9.0894193649292, |
|
"learning_rate": 2.7500000000000004e-06, |
|
"loss": 1.1286, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06664053312426499, |
|
"grad_norm": 5.936020851135254, |
|
"learning_rate": 2.8333333333333335e-06, |
|
"loss": 1.1116, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06860054880439044, |
|
"grad_norm": 6.055352210998535, |
|
"learning_rate": 2.916666666666667e-06, |
|
"loss": 1.1167, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07056056448451588, |
|
"grad_norm": 5.160061836242676, |
|
"learning_rate": 3e-06, |
|
"loss": 1.1225, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.07252058016464132, |
|
"grad_norm": 5.748876571655273, |
|
"learning_rate": 3.0833333333333336e-06, |
|
"loss": 1.1298, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07448059584476675, |
|
"grad_norm": 5.9786057472229, |
|
"learning_rate": 3.1666666666666667e-06, |
|
"loss": 1.1394, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0764406115248922, |
|
"grad_norm": 5.474339008331299, |
|
"learning_rate": 3.2500000000000002e-06, |
|
"loss": 1.138, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07840062720501764, |
|
"grad_norm": 6.028172016143799, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.1243, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08036064288514308, |
|
"grad_norm": 11.877293586730957, |
|
"learning_rate": 3.416666666666667e-06, |
|
"loss": 1.1086, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08232065856526852, |
|
"grad_norm": 6.457036972045898, |
|
"learning_rate": 3.5e-06, |
|
"loss": 1.1113, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08428067424539397, |
|
"grad_norm": 8.589061737060547, |
|
"learning_rate": 3.5833333333333335e-06, |
|
"loss": 1.1207, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0862406899255194, |
|
"grad_norm": 6.367908000946045, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 1.1192, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08820070560564484, |
|
"grad_norm": 6.087027549743652, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 1.114, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09016072128577028, |
|
"grad_norm": 6.365817546844482, |
|
"learning_rate": 3.833333333333334e-06, |
|
"loss": 1.1104, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.09212073696589573, |
|
"grad_norm": 6.38438606262207, |
|
"learning_rate": 3.916666666666667e-06, |
|
"loss": 1.1222, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09408075264602117, |
|
"grad_norm": 5.983357906341553, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.1082, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09604076832614661, |
|
"grad_norm": 7.514024257659912, |
|
"learning_rate": 4.083333333333334e-06, |
|
"loss": 1.1305, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.09800078400627205, |
|
"grad_norm": 5.6476922035217285, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 1.1183, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0999607996863975, |
|
"grad_norm": 6.5463128089904785, |
|
"learning_rate": 4.25e-06, |
|
"loss": 1.1126, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.10192081536652293, |
|
"grad_norm": 6.156862258911133, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 1.1161, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.10388083104664837, |
|
"grad_norm": 5.682535648345947, |
|
"learning_rate": 4.416666666666667e-06, |
|
"loss": 1.1096, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.10584084672677381, |
|
"grad_norm": 7.1881489753723145, |
|
"learning_rate": 4.5e-06, |
|
"loss": 1.1136, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.10780086240689926, |
|
"grad_norm": 5.990985870361328, |
|
"learning_rate": 4.583333333333333e-06, |
|
"loss": 1.1224, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1097608780870247, |
|
"grad_norm": 5.474586009979248, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 1.1078, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.11172089376715014, |
|
"grad_norm": 5.7349371910095215, |
|
"learning_rate": 4.75e-06, |
|
"loss": 1.1238, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.11368090944727557, |
|
"grad_norm": 7.513430595397949, |
|
"learning_rate": 4.833333333333333e-06, |
|
"loss": 1.1158, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.11564092512740103, |
|
"grad_norm": 5.410061836242676, |
|
"learning_rate": 4.9166666666666665e-06, |
|
"loss": 1.1102, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.11760094080752646, |
|
"grad_norm": 6.409933567047119, |
|
"learning_rate": 5e-06, |
|
"loss": 1.1035, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1195609564876519, |
|
"grad_norm": 14.141221046447754, |
|
"learning_rate": 5.0833333333333335e-06, |
|
"loss": 1.1066, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.12152097216777734, |
|
"grad_norm": 5.547974586486816, |
|
"learning_rate": 5.1666666666666675e-06, |
|
"loss": 1.1218, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.12348098784790279, |
|
"grad_norm": 5.946651935577393, |
|
"learning_rate": 5.2500000000000006e-06, |
|
"loss": 1.112, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.12544100352802823, |
|
"grad_norm": 6.724623680114746, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.0986, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.12740101920815367, |
|
"grad_norm": 8.897869110107422, |
|
"learning_rate": 5.416666666666667e-06, |
|
"loss": 1.1077, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1293610348882791, |
|
"grad_norm": 6.75730037689209, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 1.1018, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.13132105056840454, |
|
"grad_norm": 5.3133111000061035, |
|
"learning_rate": 5.583333333333334e-06, |
|
"loss": 1.1089, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.13328106624852998, |
|
"grad_norm": 5.644856929779053, |
|
"learning_rate": 5.666666666666667e-06, |
|
"loss": 1.1159, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.13524108192865542, |
|
"grad_norm": 5.264536380767822, |
|
"learning_rate": 5.75e-06, |
|
"loss": 1.1068, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.13720109760878088, |
|
"grad_norm": 5.970139026641846, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 1.1214, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13720109760878088, |
|
"eval_loss": 1.0996023416519165, |
|
"eval_runtime": 14.1011, |
|
"eval_samples_per_second": 46.379, |
|
"eval_steps_per_second": 5.815, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13916111328890632, |
|
"grad_norm": 5.035984039306641, |
|
"learning_rate": 5.916666666666667e-06, |
|
"loss": 1.1078, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.14112112896903176, |
|
"grad_norm": 5.657273292541504, |
|
"learning_rate": 6e-06, |
|
"loss": 1.0998, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1430811446491572, |
|
"grad_norm": 5.413987636566162, |
|
"learning_rate": 6.083333333333333e-06, |
|
"loss": 1.1023, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.14504116032928263, |
|
"grad_norm": 7.05068302154541, |
|
"learning_rate": 6.166666666666667e-06, |
|
"loss": 1.1053, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.14700117600940807, |
|
"grad_norm": 6.170547008514404, |
|
"learning_rate": 6.25e-06, |
|
"loss": 1.0998, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1489611916895335, |
|
"grad_norm": 5.373660087585449, |
|
"learning_rate": 6.333333333333333e-06, |
|
"loss": 1.0899, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.15092120736965894, |
|
"grad_norm": 6.981970310211182, |
|
"learning_rate": 6.416666666666667e-06, |
|
"loss": 1.113, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.1528812230497844, |
|
"grad_norm": 4.918264865875244, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 1.0884, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.15484123872990985, |
|
"grad_norm": 13.080545425415039, |
|
"learning_rate": 6.5833333333333335e-06, |
|
"loss": 1.1012, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.15680125441003528, |
|
"grad_norm": 5.957456111907959, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.1044, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15876127009016072, |
|
"grad_norm": 5.442764759063721, |
|
"learning_rate": 6.750000000000001e-06, |
|
"loss": 1.1051, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.16072128577028616, |
|
"grad_norm": 5.379378318786621, |
|
"learning_rate": 6.833333333333334e-06, |
|
"loss": 1.1092, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.1626813014504116, |
|
"grad_norm": 6.137138366699219, |
|
"learning_rate": 6.916666666666667e-06, |
|
"loss": 1.0835, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.16464131713053703, |
|
"grad_norm": 5.297956943511963, |
|
"learning_rate": 7e-06, |
|
"loss": 1.1089, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.16660133281066247, |
|
"grad_norm": 5.63748025894165, |
|
"learning_rate": 7.083333333333335e-06, |
|
"loss": 1.1024, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.16856134849078794, |
|
"grad_norm": 5.432644844055176, |
|
"learning_rate": 7.166666666666667e-06, |
|
"loss": 1.0712, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.17052136417091338, |
|
"grad_norm": 5.4105305671691895, |
|
"learning_rate": 7.25e-06, |
|
"loss": 1.1122, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1724813798510388, |
|
"grad_norm": 6.026330471038818, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 1.1049, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.17444139553116425, |
|
"grad_norm": 6.143797397613525, |
|
"learning_rate": 7.416666666666668e-06, |
|
"loss": 1.1038, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.1764014112112897, |
|
"grad_norm": 5.558448791503906, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.095, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17836142689141513, |
|
"grad_norm": 6.965857028961182, |
|
"learning_rate": 7.583333333333333e-06, |
|
"loss": 1.0979, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.18032144257154056, |
|
"grad_norm": 6.795146942138672, |
|
"learning_rate": 7.666666666666667e-06, |
|
"loss": 1.103, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.182281458251666, |
|
"grad_norm": 5.742541313171387, |
|
"learning_rate": 7.75e-06, |
|
"loss": 1.1031, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.18424147393179147, |
|
"grad_norm": 5.67035436630249, |
|
"learning_rate": 7.833333333333333e-06, |
|
"loss": 1.1018, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.1862014896119169, |
|
"grad_norm": 8.501360893249512, |
|
"learning_rate": 7.916666666666667e-06, |
|
"loss": 1.0916, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.18816150529204234, |
|
"grad_norm": 5.092253684997559, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.0886, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.19012152097216778, |
|
"grad_norm": 6.499626636505127, |
|
"learning_rate": 8.083333333333334e-06, |
|
"loss": 1.1008, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.19208153665229322, |
|
"grad_norm": 5.9327216148376465, |
|
"learning_rate": 8.166666666666668e-06, |
|
"loss": 1.0911, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.19404155233241865, |
|
"grad_norm": 5.990231990814209, |
|
"learning_rate": 8.25e-06, |
|
"loss": 1.0955, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.1960015680125441, |
|
"grad_norm": 32.17470932006836, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.1087, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19796158369266953, |
|
"grad_norm": 5.19512414932251, |
|
"learning_rate": 8.416666666666667e-06, |
|
"loss": 1.1302, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.199921599372795, |
|
"grad_norm": 5.765759468078613, |
|
"learning_rate": 8.5e-06, |
|
"loss": 1.1089, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.20188161505292043, |
|
"grad_norm": 5.5894927978515625, |
|
"learning_rate": 8.583333333333333e-06, |
|
"loss": 1.0914, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.20384163073304587, |
|
"grad_norm": 6.387049198150635, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 1.095, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2058016464131713, |
|
"grad_norm": 5.629969596862793, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 1.0999, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.20776166209329675, |
|
"grad_norm": 22.919910430908203, |
|
"learning_rate": 8.833333333333334e-06, |
|
"loss": 1.096, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.20972167777342218, |
|
"grad_norm": 5.267335414886475, |
|
"learning_rate": 8.916666666666667e-06, |
|
"loss": 1.1006, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.21168169345354762, |
|
"grad_norm": 53.39712905883789, |
|
"learning_rate": 9e-06, |
|
"loss": 1.106, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.21364170913367306, |
|
"grad_norm": 5.858781814575195, |
|
"learning_rate": 9.083333333333333e-06, |
|
"loss": 1.2549, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.21560172481379852, |
|
"grad_norm": 6.5703935623168945, |
|
"learning_rate": 9.166666666666666e-06, |
|
"loss": 1.094, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21756174049392396, |
|
"grad_norm": 5.414449214935303, |
|
"learning_rate": 9.250000000000001e-06, |
|
"loss": 1.0891, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2195217561740494, |
|
"grad_norm": 7.185405731201172, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.089, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.22148177185417484, |
|
"grad_norm": 6.4889092445373535, |
|
"learning_rate": 9.416666666666667e-06, |
|
"loss": 1.0874, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.22344178753430027, |
|
"grad_norm": 6.4478759765625, |
|
"learning_rate": 9.5e-06, |
|
"loss": 1.1111, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2254018032144257, |
|
"grad_norm": 6.49618673324585, |
|
"learning_rate": 9.583333333333335e-06, |
|
"loss": 1.1001, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.22736181889455115, |
|
"grad_norm": 7.893542766571045, |
|
"learning_rate": 9.666666666666667e-06, |
|
"loss": 1.097, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.2293218345746766, |
|
"grad_norm": 11.165901184082031, |
|
"learning_rate": 9.75e-06, |
|
"loss": 1.092, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.23128185025480205, |
|
"grad_norm": 21.967939376831055, |
|
"learning_rate": 9.833333333333333e-06, |
|
"loss": 1.1216, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2332418659349275, |
|
"grad_norm": 14.317709922790527, |
|
"learning_rate": 9.916666666666668e-06, |
|
"loss": 1.1047, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.23520188161505293, |
|
"grad_norm": 6.240993499755859, |
|
"learning_rate": 1e-05, |
|
"loss": 1.1045, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23716189729517836, |
|
"grad_norm": 5.804283142089844, |
|
"learning_rate": 9.990740740740741e-06, |
|
"loss": 1.0933, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.2391219129753038, |
|
"grad_norm": 5.778935432434082, |
|
"learning_rate": 9.981481481481482e-06, |
|
"loss": 1.1098, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.24108192865542924, |
|
"grad_norm": 6.391726016998291, |
|
"learning_rate": 9.972222222222224e-06, |
|
"loss": 1.0993, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.24304194433555468, |
|
"grad_norm": 4.967405319213867, |
|
"learning_rate": 9.962962962962964e-06, |
|
"loss": 1.0835, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.24500196001568011, |
|
"grad_norm": 11.337072372436523, |
|
"learning_rate": 9.953703703703704e-06, |
|
"loss": 1.1057, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.24696197569580558, |
|
"grad_norm": 5.182584285736084, |
|
"learning_rate": 9.944444444444445e-06, |
|
"loss": 1.0913, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.24892199137593102, |
|
"grad_norm": 6.305624961853027, |
|
"learning_rate": 9.935185185185185e-06, |
|
"loss": 1.0963, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.25088200705605646, |
|
"grad_norm": 5.263504981994629, |
|
"learning_rate": 9.925925925925927e-06, |
|
"loss": 1.1097, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.2528420227361819, |
|
"grad_norm": 5.2210493087768555, |
|
"learning_rate": 9.916666666666668e-06, |
|
"loss": 1.0934, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.25480203841630733, |
|
"grad_norm": 6.707633972167969, |
|
"learning_rate": 9.907407407407408e-06, |
|
"loss": 1.1049, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.25676205409643277, |
|
"grad_norm": 5.195355415344238, |
|
"learning_rate": 9.898148148148148e-06, |
|
"loss": 1.0866, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.2587220697765582, |
|
"grad_norm": 7.137194633483887, |
|
"learning_rate": 9.88888888888889e-06, |
|
"loss": 1.1032, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.26068208545668364, |
|
"grad_norm": 5.271111488342285, |
|
"learning_rate": 9.87962962962963e-06, |
|
"loss": 1.0925, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.2626421011368091, |
|
"grad_norm": 6.781525135040283, |
|
"learning_rate": 9.870370370370371e-06, |
|
"loss": 1.0966, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.2646021168169345, |
|
"grad_norm": 8.270143508911133, |
|
"learning_rate": 9.861111111111112e-06, |
|
"loss": 1.0991, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.26656213249705996, |
|
"grad_norm": 6.368267059326172, |
|
"learning_rate": 9.851851851851852e-06, |
|
"loss": 1.1005, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.2685221481771854, |
|
"grad_norm": 5.750402927398682, |
|
"learning_rate": 9.842592592592594e-06, |
|
"loss": 1.0907, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.27048216385731083, |
|
"grad_norm": 5.706563472747803, |
|
"learning_rate": 9.833333333333333e-06, |
|
"loss": 1.1133, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.2724421795374363, |
|
"grad_norm": 5.816508769989014, |
|
"learning_rate": 9.824074074074075e-06, |
|
"loss": 1.1074, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.27440219521756176, |
|
"grad_norm": 4.7041449546813965, |
|
"learning_rate": 9.814814814814815e-06, |
|
"loss": 1.0944, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27440219521756176, |
|
"eval_loss": 1.089659571647644, |
|
"eval_runtime": 14.1203, |
|
"eval_samples_per_second": 46.316, |
|
"eval_steps_per_second": 5.807, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2763622108976872, |
|
"grad_norm": 4.5806355476379395, |
|
"learning_rate": 9.805555555555556e-06, |
|
"loss": 1.0868, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.27832222657781264, |
|
"grad_norm": 6.664804458618164, |
|
"learning_rate": 9.796296296296298e-06, |
|
"loss": 1.1031, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.2802822422579381, |
|
"grad_norm": 5.814957141876221, |
|
"learning_rate": 9.787037037037038e-06, |
|
"loss": 1.0818, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.2822422579380635, |
|
"grad_norm": 11.043229103088379, |
|
"learning_rate": 9.777777777777779e-06, |
|
"loss": 1.1147, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.28420227361818895, |
|
"grad_norm": 5.907972812652588, |
|
"learning_rate": 9.768518518518519e-06, |
|
"loss": 1.0949, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.2861622892983144, |
|
"grad_norm": 8.000860214233398, |
|
"learning_rate": 9.759259259259261e-06, |
|
"loss": 1.107, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.2881223049784398, |
|
"grad_norm": 5.279980182647705, |
|
"learning_rate": 9.75e-06, |
|
"loss": 1.0988, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.29008232065856526, |
|
"grad_norm": 6.427779674530029, |
|
"learning_rate": 9.740740740740742e-06, |
|
"loss": 1.0969, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.2920423363386907, |
|
"grad_norm": 5.733342170715332, |
|
"learning_rate": 9.731481481481482e-06, |
|
"loss": 1.0832, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.29400235201881614, |
|
"grad_norm": 5.370787620544434, |
|
"learning_rate": 9.722222222222223e-06, |
|
"loss": 1.0724, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2959623676989416, |
|
"grad_norm": 4.780156135559082, |
|
"learning_rate": 9.712962962962965e-06, |
|
"loss": 1.0911, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.297922383379067, |
|
"grad_norm": 5.770766258239746, |
|
"learning_rate": 9.703703703703703e-06, |
|
"loss": 1.0913, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.29988239905919245, |
|
"grad_norm": 5.799376487731934, |
|
"learning_rate": 9.694444444444446e-06, |
|
"loss": 1.1072, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.3018424147393179, |
|
"grad_norm": 4.571477890014648, |
|
"learning_rate": 9.685185185185186e-06, |
|
"loss": 1.0977, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.3038024304194434, |
|
"grad_norm": 6.020542144775391, |
|
"learning_rate": 9.675925925925926e-06, |
|
"loss": 1.1056, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.3057624460995688, |
|
"grad_norm": 6.760439395904541, |
|
"learning_rate": 9.666666666666667e-06, |
|
"loss": 1.0874, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.30772246177969426, |
|
"grad_norm": 4.900004863739014, |
|
"learning_rate": 9.657407407407409e-06, |
|
"loss": 1.0946, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.3096824774598197, |
|
"grad_norm": 4.82609748840332, |
|
"learning_rate": 9.64814814814815e-06, |
|
"loss": 1.1017, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.31164249313994513, |
|
"grad_norm": 5.115661144256592, |
|
"learning_rate": 9.63888888888889e-06, |
|
"loss": 1.1073, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.31360250882007057, |
|
"grad_norm": 4.987372875213623, |
|
"learning_rate": 9.62962962962963e-06, |
|
"loss": 1.084, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.315562524500196, |
|
"grad_norm": 5.010407447814941, |
|
"learning_rate": 9.62037037037037e-06, |
|
"loss": 1.1031, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.31752254018032144, |
|
"grad_norm": 4.8331122398376465, |
|
"learning_rate": 9.611111111111112e-06, |
|
"loss": 1.0995, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.3194825558604469, |
|
"grad_norm": 5.5287909507751465, |
|
"learning_rate": 9.601851851851853e-06, |
|
"loss": 1.0884, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.3214425715405723, |
|
"grad_norm": 7.749701023101807, |
|
"learning_rate": 9.592592592592593e-06, |
|
"loss": 1.0786, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.32340258722069776, |
|
"grad_norm": 4.908470153808594, |
|
"learning_rate": 9.583333333333335e-06, |
|
"loss": 1.1049, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.3253626029008232, |
|
"grad_norm": 8.760143280029297, |
|
"learning_rate": 9.574074074074074e-06, |
|
"loss": 1.1, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.32732261858094863, |
|
"grad_norm": 4.7910590171813965, |
|
"learning_rate": 9.564814814814816e-06, |
|
"loss": 1.0936, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.32928263426107407, |
|
"grad_norm": 10.59984302520752, |
|
"learning_rate": 9.555555555555556e-06, |
|
"loss": 1.0888, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.3312426499411995, |
|
"grad_norm": 5.18034029006958, |
|
"learning_rate": 9.546296296296297e-06, |
|
"loss": 1.1015, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.33320266562132494, |
|
"grad_norm": 4.835277557373047, |
|
"learning_rate": 9.537037037037037e-06, |
|
"loss": 1.0934, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.33516268130145044, |
|
"grad_norm": 5.690189838409424, |
|
"learning_rate": 9.527777777777778e-06, |
|
"loss": 1.0981, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.3371226969815759, |
|
"grad_norm": 5.126386642456055, |
|
"learning_rate": 9.51851851851852e-06, |
|
"loss": 1.0731, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.3390827126617013, |
|
"grad_norm": 5.041740894317627, |
|
"learning_rate": 9.50925925925926e-06, |
|
"loss": 1.0822, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.34104272834182675, |
|
"grad_norm": 5.251774787902832, |
|
"learning_rate": 9.5e-06, |
|
"loss": 1.0914, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.3430027440219522, |
|
"grad_norm": 4.976339817047119, |
|
"learning_rate": 9.490740740740741e-06, |
|
"loss": 1.0994, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.3449627597020776, |
|
"grad_norm": 4.93164587020874, |
|
"learning_rate": 9.481481481481483e-06, |
|
"loss": 1.1029, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.34692277538220306, |
|
"grad_norm": 4.964654445648193, |
|
"learning_rate": 9.472222222222223e-06, |
|
"loss": 1.0846, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.3488827910623285, |
|
"grad_norm": 5.763545036315918, |
|
"learning_rate": 9.462962962962964e-06, |
|
"loss": 1.0861, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.35084280674245394, |
|
"grad_norm": 7.256554126739502, |
|
"learning_rate": 9.453703703703704e-06, |
|
"loss": 1.0878, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.3528028224225794, |
|
"grad_norm": 5.118113994598389, |
|
"learning_rate": 9.444444444444445e-06, |
|
"loss": 1.0814, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3547628381027048, |
|
"grad_norm": 5.370246410369873, |
|
"learning_rate": 9.435185185185187e-06, |
|
"loss": 1.0977, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.35672285378283025, |
|
"grad_norm": 4.952902793884277, |
|
"learning_rate": 9.425925925925925e-06, |
|
"loss": 1.0889, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.3586828694629557, |
|
"grad_norm": 4.842602252960205, |
|
"learning_rate": 9.416666666666667e-06, |
|
"loss": 1.0926, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.3606428851430811, |
|
"grad_norm": 4.805694103240967, |
|
"learning_rate": 9.407407407407408e-06, |
|
"loss": 1.1087, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.36260290082320656, |
|
"grad_norm": 5.138493061065674, |
|
"learning_rate": 9.398148148148148e-06, |
|
"loss": 1.0862, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.364562916503332, |
|
"grad_norm": 7.099350929260254, |
|
"learning_rate": 9.38888888888889e-06, |
|
"loss": 1.0887, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.3665229321834575, |
|
"grad_norm": 5.295989990234375, |
|
"learning_rate": 9.37962962962963e-06, |
|
"loss": 1.0715, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.36848294786358293, |
|
"grad_norm": 5.222052097320557, |
|
"learning_rate": 9.370370370370371e-06, |
|
"loss": 1.0979, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.37044296354370837, |
|
"grad_norm": 4.347951889038086, |
|
"learning_rate": 9.361111111111111e-06, |
|
"loss": 1.1036, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.3724029792238338, |
|
"grad_norm": 5.062961101531982, |
|
"learning_rate": 9.351851851851854e-06, |
|
"loss": 1.0864, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.37436299490395925, |
|
"grad_norm": 5.308873176574707, |
|
"learning_rate": 9.342592592592594e-06, |
|
"loss": 1.0877, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.3763230105840847, |
|
"grad_norm": 4.578522682189941, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.0961, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.3782830262642101, |
|
"grad_norm": 4.571235179901123, |
|
"learning_rate": 9.324074074074075e-06, |
|
"loss": 1.0838, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.38024304194433556, |
|
"grad_norm": 4.522839069366455, |
|
"learning_rate": 9.314814814814815e-06, |
|
"loss": 1.0786, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.382203057624461, |
|
"grad_norm": 4.68695068359375, |
|
"learning_rate": 9.305555555555557e-06, |
|
"loss": 1.1146, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.38416307330458643, |
|
"grad_norm": 4.976430416107178, |
|
"learning_rate": 9.296296296296296e-06, |
|
"loss": 1.0937, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.38612308898471187, |
|
"grad_norm": 5.086645126342773, |
|
"learning_rate": 9.287037037037038e-06, |
|
"loss": 1.097, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.3880831046648373, |
|
"grad_norm": 4.946636199951172, |
|
"learning_rate": 9.277777777777778e-06, |
|
"loss": 1.0986, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.39004312034496275, |
|
"grad_norm": 4.823138236999512, |
|
"learning_rate": 9.268518518518519e-06, |
|
"loss": 1.1034, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.3920031360250882, |
|
"grad_norm": 5.26657772064209, |
|
"learning_rate": 9.25925925925926e-06, |
|
"loss": 1.0798, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3939631517052136, |
|
"grad_norm": 5.231563568115234, |
|
"learning_rate": 9.250000000000001e-06, |
|
"loss": 1.0944, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.39592316738533906, |
|
"grad_norm": 4.791154384613037, |
|
"learning_rate": 9.240740740740742e-06, |
|
"loss": 1.0949, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.3978831830654645, |
|
"grad_norm": 4.691703796386719, |
|
"learning_rate": 9.231481481481482e-06, |
|
"loss": 1.0856, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.39984319874559, |
|
"grad_norm": 7.213362693786621, |
|
"learning_rate": 9.222222222222224e-06, |
|
"loss": 1.0922, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.4018032144257154, |
|
"grad_norm": 6.0655646324157715, |
|
"learning_rate": 9.212962962962963e-06, |
|
"loss": 1.0948, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.40376323010584086, |
|
"grad_norm": 4.249541759490967, |
|
"learning_rate": 9.203703703703705e-06, |
|
"loss": 1.091, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.4057232457859663, |
|
"grad_norm": 4.832751750946045, |
|
"learning_rate": 9.194444444444445e-06, |
|
"loss": 1.0907, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.40768326146609174, |
|
"grad_norm": 5.216608047485352, |
|
"learning_rate": 9.185185185185186e-06, |
|
"loss": 1.0802, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.4096432771462172, |
|
"grad_norm": 5.988987445831299, |
|
"learning_rate": 9.175925925925928e-06, |
|
"loss": 1.0938, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.4116032928263426, |
|
"grad_norm": 4.744123458862305, |
|
"learning_rate": 9.166666666666666e-06, |
|
"loss": 1.0889, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4116032928263426, |
|
"eval_loss": 1.08055579662323, |
|
"eval_runtime": 14.0694, |
|
"eval_samples_per_second": 46.484, |
|
"eval_steps_per_second": 5.828, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.41356330850646805, |
|
"grad_norm": 5.0730485916137695, |
|
"learning_rate": 9.157407407407409e-06, |
|
"loss": 1.0956, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.4155233241865935, |
|
"grad_norm": 4.156872749328613, |
|
"learning_rate": 9.148148148148149e-06, |
|
"loss": 1.0899, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.4174833398667189, |
|
"grad_norm": 5.194650173187256, |
|
"learning_rate": 9.13888888888889e-06, |
|
"loss": 1.0772, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.41944335554684437, |
|
"grad_norm": 5.057788848876953, |
|
"learning_rate": 9.12962962962963e-06, |
|
"loss": 1.0886, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.4214033712269698, |
|
"grad_norm": 4.849510192871094, |
|
"learning_rate": 9.120370370370372e-06, |
|
"loss": 1.1027, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.42336338690709524, |
|
"grad_norm": 4.8577704429626465, |
|
"learning_rate": 9.111111111111112e-06, |
|
"loss": 1.0809, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.4253234025872207, |
|
"grad_norm": 4.937686443328857, |
|
"learning_rate": 9.101851851851853e-06, |
|
"loss": 1.0819, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.4272834182673461, |
|
"grad_norm": 5.295153617858887, |
|
"learning_rate": 9.092592592592593e-06, |
|
"loss": 1.0734, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.42924343394747155, |
|
"grad_norm": 5.876978874206543, |
|
"learning_rate": 9.083333333333333e-06, |
|
"loss": 1.078, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.43120344962759705, |
|
"grad_norm": 4.725461006164551, |
|
"learning_rate": 9.074074074074075e-06, |
|
"loss": 1.0787, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4331634653077225, |
|
"grad_norm": 4.425513744354248, |
|
"learning_rate": 9.064814814814816e-06, |
|
"loss": 1.1066, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.4351234809878479, |
|
"grad_norm": 4.781891822814941, |
|
"learning_rate": 9.055555555555556e-06, |
|
"loss": 1.083, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.43708349666797336, |
|
"grad_norm": 5.292329788208008, |
|
"learning_rate": 9.046296296296298e-06, |
|
"loss": 1.094, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.4390435123480988, |
|
"grad_norm": 5.246599197387695, |
|
"learning_rate": 9.037037037037037e-06, |
|
"loss": 1.0903, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.44100352802822423, |
|
"grad_norm": 4.701263427734375, |
|
"learning_rate": 9.027777777777779e-06, |
|
"loss": 1.0811, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.44296354370834967, |
|
"grad_norm": 5.434783458709717, |
|
"learning_rate": 9.01851851851852e-06, |
|
"loss": 1.0862, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.4449235593884751, |
|
"grad_norm": 5.898828506469727, |
|
"learning_rate": 9.00925925925926e-06, |
|
"loss": 1.0972, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.44688357506860055, |
|
"grad_norm": 5.089272975921631, |
|
"learning_rate": 9e-06, |
|
"loss": 1.091, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.448843590748726, |
|
"grad_norm": 6.887242317199707, |
|
"learning_rate": 8.99074074074074e-06, |
|
"loss": 1.0785, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.4508036064288514, |
|
"grad_norm": 5.111120700836182, |
|
"learning_rate": 8.981481481481483e-06, |
|
"loss": 1.0682, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.45276362210897686, |
|
"grad_norm": 5.331275939941406, |
|
"learning_rate": 8.972222222222223e-06, |
|
"loss": 1.0854, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.4547236377891023, |
|
"grad_norm": 5.59519100189209, |
|
"learning_rate": 8.962962962962963e-06, |
|
"loss": 1.0883, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.45668365346922773, |
|
"grad_norm": 5.6016364097595215, |
|
"learning_rate": 8.953703703703704e-06, |
|
"loss": 1.0799, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.4586436691493532, |
|
"grad_norm": 4.625690460205078, |
|
"learning_rate": 8.944444444444446e-06, |
|
"loss": 1.0716, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.4606036848294786, |
|
"grad_norm": 4.6536688804626465, |
|
"learning_rate": 8.935185185185186e-06, |
|
"loss": 1.0624, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.4625637005096041, |
|
"grad_norm": 4.807366847991943, |
|
"learning_rate": 8.925925925925927e-06, |
|
"loss": 1.0829, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.46452371618972954, |
|
"grad_norm": 4.950624942779541, |
|
"learning_rate": 8.916666666666667e-06, |
|
"loss": 1.0861, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.466483731869855, |
|
"grad_norm": 4.980051517486572, |
|
"learning_rate": 8.907407407407408e-06, |
|
"loss": 1.0772, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.4684437475499804, |
|
"grad_norm": 5.208264350891113, |
|
"learning_rate": 8.89814814814815e-06, |
|
"loss": 1.0929, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.47040376323010585, |
|
"grad_norm": 5.6281657218933105, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 1.0818, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4723637789102313, |
|
"grad_norm": 6.125396251678467, |
|
"learning_rate": 8.87962962962963e-06, |
|
"loss": 1.0865, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.47432379459035673, |
|
"grad_norm": 5.989163398742676, |
|
"learning_rate": 8.87037037037037e-06, |
|
"loss": 1.0685, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.47628381027048217, |
|
"grad_norm": 9.247689247131348, |
|
"learning_rate": 8.861111111111111e-06, |
|
"loss": 1.0879, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.4782438259506076, |
|
"grad_norm": 4.848996639251709, |
|
"learning_rate": 8.851851851851853e-06, |
|
"loss": 1.0783, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.48020384163073304, |
|
"grad_norm": 5.04819917678833, |
|
"learning_rate": 8.842592592592594e-06, |
|
"loss": 1.0679, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.4821638573108585, |
|
"grad_norm": 5.755705833435059, |
|
"learning_rate": 8.833333333333334e-06, |
|
"loss": 1.0939, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.4841238729909839, |
|
"grad_norm": 4.5991058349609375, |
|
"learning_rate": 8.824074074074074e-06, |
|
"loss": 1.0775, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.48608388867110935, |
|
"grad_norm": 5.101771354675293, |
|
"learning_rate": 8.814814814814817e-06, |
|
"loss": 1.084, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.4880439043512348, |
|
"grad_norm": 4.9321441650390625, |
|
"learning_rate": 8.805555555555557e-06, |
|
"loss": 1.0762, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.49000392003136023, |
|
"grad_norm": 5.363485813140869, |
|
"learning_rate": 8.796296296296297e-06, |
|
"loss": 1.0595, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.49196393571148567, |
|
"grad_norm": 5.353972434997559, |
|
"learning_rate": 8.787037037037038e-06, |
|
"loss": 1.0689, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.49392395139161116, |
|
"grad_norm": 5.158483505249023, |
|
"learning_rate": 8.777777777777778e-06, |
|
"loss": 1.0711, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.4958839670717366, |
|
"grad_norm": 5.160126686096191, |
|
"learning_rate": 8.76851851851852e-06, |
|
"loss": 1.0886, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.49784398275186204, |
|
"grad_norm": 4.926945686340332, |
|
"learning_rate": 8.759259259259259e-06, |
|
"loss": 1.0856, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.4998039984319875, |
|
"grad_norm": 5.253294467926025, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 1.0876, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5017640141121129, |
|
"grad_norm": 11.674654960632324, |
|
"learning_rate": 8.740740740740741e-06, |
|
"loss": 1.0784, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.5037240297922383, |
|
"grad_norm": 5.032644271850586, |
|
"learning_rate": 8.731481481481482e-06, |
|
"loss": 1.07, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.5056840454723638, |
|
"grad_norm": 5.1500372886657715, |
|
"learning_rate": 8.722222222222224e-06, |
|
"loss": 1.0932, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.5076440611524892, |
|
"grad_norm": 5.761635780334473, |
|
"learning_rate": 8.712962962962964e-06, |
|
"loss": 1.0816, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.5096040768326147, |
|
"grad_norm": 5.257477760314941, |
|
"learning_rate": 8.703703703703705e-06, |
|
"loss": 1.0712, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5115640925127402, |
|
"grad_norm": 5.4931416511535645, |
|
"learning_rate": 8.694444444444445e-06, |
|
"loss": 1.0832, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.5135241081928655, |
|
"grad_norm": 6.102149963378906, |
|
"learning_rate": 8.685185185185185e-06, |
|
"loss": 1.076, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.515484123872991, |
|
"grad_norm": 5.596585273742676, |
|
"learning_rate": 8.675925925925926e-06, |
|
"loss": 1.0972, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.5174441395531164, |
|
"grad_norm": 4.6653523445129395, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 1.0872, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.5194041552332419, |
|
"grad_norm": 4.938266277313232, |
|
"learning_rate": 8.657407407407408e-06, |
|
"loss": 1.0717, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5213641709133673, |
|
"grad_norm": 5.11916446685791, |
|
"learning_rate": 8.648148148148149e-06, |
|
"loss": 1.0955, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.5233241865934928, |
|
"grad_norm": 4.867392539978027, |
|
"learning_rate": 8.63888888888889e-06, |
|
"loss": 1.0769, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.5252842022736182, |
|
"grad_norm": 5.589994430541992, |
|
"learning_rate": 8.62962962962963e-06, |
|
"loss": 1.0808, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.5272442179537437, |
|
"grad_norm": 5.221999645233154, |
|
"learning_rate": 8.620370370370371e-06, |
|
"loss": 1.0627, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.529204233633869, |
|
"grad_norm": 4.844895839691162, |
|
"learning_rate": 8.611111111111112e-06, |
|
"loss": 1.0951, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5311642493139945, |
|
"grad_norm": 4.937107563018799, |
|
"learning_rate": 8.601851851851852e-06, |
|
"loss": 1.0898, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.5331242649941199, |
|
"grad_norm": 4.758755207061768, |
|
"learning_rate": 8.592592592592593e-06, |
|
"loss": 1.1017, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.5350842806742454, |
|
"grad_norm": 5.145150661468506, |
|
"learning_rate": 8.583333333333333e-06, |
|
"loss": 1.0702, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.5370442963543708, |
|
"grad_norm": 5.257258415222168, |
|
"learning_rate": 8.574074074074075e-06, |
|
"loss": 1.0822, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.5390043120344963, |
|
"grad_norm": 5.826773166656494, |
|
"learning_rate": 8.564814814814816e-06, |
|
"loss": 1.0714, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.5409643277146217, |
|
"grad_norm": 4.5895304679870605, |
|
"learning_rate": 8.555555555555556e-06, |
|
"loss": 1.0866, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.5429243433947472, |
|
"grad_norm": 6.960358619689941, |
|
"learning_rate": 8.546296296296296e-06, |
|
"loss": 1.0942, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.5448843590748726, |
|
"grad_norm": 5.870516300201416, |
|
"learning_rate": 8.537037037037038e-06, |
|
"loss": 1.0937, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.546844374754998, |
|
"grad_norm": 4.99727725982666, |
|
"learning_rate": 8.527777777777779e-06, |
|
"loss": 1.0851, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.5488043904351235, |
|
"grad_norm": 4.9571003913879395, |
|
"learning_rate": 8.518518518518519e-06, |
|
"loss": 1.085, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5488043904351235, |
|
"eval_loss": 1.073391318321228, |
|
"eval_runtime": 14.0283, |
|
"eval_samples_per_second": 46.62, |
|
"eval_steps_per_second": 5.845, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5507644061152489, |
|
"grad_norm": 4.457803726196289, |
|
"learning_rate": 8.509259259259261e-06, |
|
"loss": 1.0787, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.5527244217953744, |
|
"grad_norm": 5.098880290985107, |
|
"learning_rate": 8.5e-06, |
|
"loss": 1.0672, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.5546844374754998, |
|
"grad_norm": 5.3498735427856445, |
|
"learning_rate": 8.490740740740742e-06, |
|
"loss": 1.0852, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.5566444531556253, |
|
"grad_norm": 5.1486921310424805, |
|
"learning_rate": 8.481481481481482e-06, |
|
"loss": 1.0564, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.5586044688357507, |
|
"grad_norm": 5.613958358764648, |
|
"learning_rate": 8.472222222222223e-06, |
|
"loss": 1.0864, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.5605644845158761, |
|
"grad_norm": 5.687408924102783, |
|
"learning_rate": 8.462962962962963e-06, |
|
"loss": 1.0871, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.5625245001960015, |
|
"grad_norm": 5.822880268096924, |
|
"learning_rate": 8.453703703703704e-06, |
|
"loss": 1.0708, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.564484515876127, |
|
"grad_norm": 5.982561111450195, |
|
"learning_rate": 8.444444444444446e-06, |
|
"loss": 1.0633, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.5664445315562524, |
|
"grad_norm": 4.887526035308838, |
|
"learning_rate": 8.435185185185186e-06, |
|
"loss": 1.0823, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.5684045472363779, |
|
"grad_norm": 5.810245037078857, |
|
"learning_rate": 8.425925925925926e-06, |
|
"loss": 1.0683, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5703645629165033, |
|
"grad_norm": 4.757987022399902, |
|
"learning_rate": 8.416666666666667e-06, |
|
"loss": 1.0703, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.5723245785966288, |
|
"grad_norm": 5.597621440887451, |
|
"learning_rate": 8.407407407407409e-06, |
|
"loss": 1.0861, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.5742845942767543, |
|
"grad_norm": 9.786075592041016, |
|
"learning_rate": 8.39814814814815e-06, |
|
"loss": 1.0755, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.5762446099568796, |
|
"grad_norm": 4.969895362854004, |
|
"learning_rate": 8.38888888888889e-06, |
|
"loss": 1.0907, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.5782046256370051, |
|
"grad_norm": 5.833816051483154, |
|
"learning_rate": 8.37962962962963e-06, |
|
"loss": 1.0679, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5801646413171305, |
|
"grad_norm": 5.07546329498291, |
|
"learning_rate": 8.37037037037037e-06, |
|
"loss": 1.0705, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.582124656997256, |
|
"grad_norm": 5.735626220703125, |
|
"learning_rate": 8.361111111111113e-06, |
|
"loss": 1.0676, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.5840846726773814, |
|
"grad_norm": 4.732352256774902, |
|
"learning_rate": 8.351851851851851e-06, |
|
"loss": 1.0809, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.5860446883575069, |
|
"grad_norm": 6.006665229797363, |
|
"learning_rate": 8.342592592592593e-06, |
|
"loss": 1.0624, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.5880047040376323, |
|
"grad_norm": 6.649717330932617, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.072, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5899647197177578, |
|
"grad_norm": 5.156081199645996, |
|
"learning_rate": 8.324074074074074e-06, |
|
"loss": 1.0791, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.5919247353978832, |
|
"grad_norm": 5.489958763122559, |
|
"learning_rate": 8.314814814814816e-06, |
|
"loss": 1.0803, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.5938847510780086, |
|
"grad_norm": 5.912964344024658, |
|
"learning_rate": 8.305555555555557e-06, |
|
"loss": 1.0888, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.595844766758134, |
|
"grad_norm": 5.113432884216309, |
|
"learning_rate": 8.296296296296297e-06, |
|
"loss": 1.0728, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.5978047824382595, |
|
"grad_norm": 4.774835109710693, |
|
"learning_rate": 8.287037037037037e-06, |
|
"loss": 1.0772, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.5997647981183849, |
|
"grad_norm": 5.954660892486572, |
|
"learning_rate": 8.277777777777778e-06, |
|
"loss": 1.0805, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.6017248137985104, |
|
"grad_norm": 4.933722019195557, |
|
"learning_rate": 8.26851851851852e-06, |
|
"loss": 1.0678, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.6036848294786358, |
|
"grad_norm": 5.152841567993164, |
|
"learning_rate": 8.25925925925926e-06, |
|
"loss": 1.0831, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.6056448451587613, |
|
"grad_norm": 5.011537551879883, |
|
"learning_rate": 8.25e-06, |
|
"loss": 1.0841, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.6076048608388868, |
|
"grad_norm": 4.196855545043945, |
|
"learning_rate": 8.240740740740741e-06, |
|
"loss": 1.0647, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6095648765190121, |
|
"grad_norm": 5.21035099029541, |
|
"learning_rate": 8.231481481481483e-06, |
|
"loss": 1.049, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.6115248921991376, |
|
"grad_norm": 26.132904052734375, |
|
"learning_rate": 8.222222222222222e-06, |
|
"loss": 1.0799, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.613484907879263, |
|
"grad_norm": 6.4696760177612305, |
|
"learning_rate": 8.212962962962964e-06, |
|
"loss": 1.0666, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.6154449235593885, |
|
"grad_norm": 4.687003135681152, |
|
"learning_rate": 8.203703703703704e-06, |
|
"loss": 1.0732, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.6174049392395139, |
|
"grad_norm": 5.188929557800293, |
|
"learning_rate": 8.194444444444445e-06, |
|
"loss": 1.0911, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.6193649549196394, |
|
"grad_norm": 4.781430244445801, |
|
"learning_rate": 8.185185185185187e-06, |
|
"loss": 1.0705, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.6213249705997648, |
|
"grad_norm": 4.717843055725098, |
|
"learning_rate": 8.175925925925925e-06, |
|
"loss": 1.0822, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.6232849862798903, |
|
"grad_norm": 5.404903888702393, |
|
"learning_rate": 8.166666666666668e-06, |
|
"loss": 1.0917, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.6252450019600156, |
|
"grad_norm": 5.684728145599365, |
|
"learning_rate": 8.157407407407408e-06, |
|
"loss": 1.0842, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.6272050176401411, |
|
"grad_norm": 4.7895965576171875, |
|
"learning_rate": 8.148148148148148e-06, |
|
"loss": 1.0652, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6291650333202665, |
|
"grad_norm": 6.572015285491943, |
|
"learning_rate": 8.138888888888889e-06, |
|
"loss": 1.0959, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.631125049000392, |
|
"grad_norm": 4.640923976898193, |
|
"learning_rate": 8.12962962962963e-06, |
|
"loss": 1.0896, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.6330850646805174, |
|
"grad_norm": 4.732535362243652, |
|
"learning_rate": 8.120370370370371e-06, |
|
"loss": 1.0646, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.6350450803606429, |
|
"grad_norm": 5.692999362945557, |
|
"learning_rate": 8.111111111111112e-06, |
|
"loss": 1.0774, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.6370050960407684, |
|
"grad_norm": 5.28448486328125, |
|
"learning_rate": 8.101851851851854e-06, |
|
"loss": 1.0844, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.6389651117208938, |
|
"grad_norm": 3.9686877727508545, |
|
"learning_rate": 8.092592592592592e-06, |
|
"loss": 1.0626, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.6409251274010193, |
|
"grad_norm": 6.364278793334961, |
|
"learning_rate": 8.083333333333334e-06, |
|
"loss": 1.0696, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.6428851430811446, |
|
"grad_norm": 4.789191722869873, |
|
"learning_rate": 8.074074074074075e-06, |
|
"loss": 1.0696, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.6448451587612701, |
|
"grad_norm": 4.61089563369751, |
|
"learning_rate": 8.064814814814815e-06, |
|
"loss": 1.0701, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.6468051744413955, |
|
"grad_norm": 4.959714889526367, |
|
"learning_rate": 8.055555555555557e-06, |
|
"loss": 1.0852, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.648765190121521, |
|
"grad_norm": 4.559225082397461, |
|
"learning_rate": 8.046296296296296e-06, |
|
"loss": 1.0708, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.6507252058016464, |
|
"grad_norm": 4.9751691818237305, |
|
"learning_rate": 8.037037037037038e-06, |
|
"loss": 1.0545, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.6526852214817719, |
|
"grad_norm": 5.153299808502197, |
|
"learning_rate": 8.027777777777778e-06, |
|
"loss": 1.074, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.6546452371618973, |
|
"grad_norm": 5.507195472717285, |
|
"learning_rate": 8.018518518518519e-06, |
|
"loss": 1.0772, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.6566052528420228, |
|
"grad_norm": 4.8530449867248535, |
|
"learning_rate": 8.00925925925926e-06, |
|
"loss": 1.087, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.6585652685221481, |
|
"grad_norm": 4.754231929779053, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.0792, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.6605252842022736, |
|
"grad_norm": 4.642824172973633, |
|
"learning_rate": 7.990740740740742e-06, |
|
"loss": 1.0681, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.662485299882399, |
|
"grad_norm": 5.338252067565918, |
|
"learning_rate": 7.981481481481482e-06, |
|
"loss": 1.0579, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.6644453155625245, |
|
"grad_norm": 5.290416240692139, |
|
"learning_rate": 7.972222222222224e-06, |
|
"loss": 1.0523, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.6664053312426499, |
|
"grad_norm": 4.5010294914245605, |
|
"learning_rate": 7.962962962962963e-06, |
|
"loss": 1.0694, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6683653469227754, |
|
"grad_norm": 5.253542900085449, |
|
"learning_rate": 7.953703703703705e-06, |
|
"loss": 1.0745, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.6703253626029009, |
|
"grad_norm": 4.743661880493164, |
|
"learning_rate": 7.944444444444445e-06, |
|
"loss": 1.0586, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.6722853782830263, |
|
"grad_norm": 5.1141839027404785, |
|
"learning_rate": 7.935185185185186e-06, |
|
"loss": 1.0767, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.6742453939631518, |
|
"grad_norm": 4.729719638824463, |
|
"learning_rate": 7.925925925925926e-06, |
|
"loss": 1.0841, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6762054096432771, |
|
"grad_norm": 5.445465087890625, |
|
"learning_rate": 7.916666666666667e-06, |
|
"loss": 1.0901, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6781654253234026, |
|
"grad_norm": 4.362582683563232, |
|
"learning_rate": 7.907407407407409e-06, |
|
"loss": 1.0634, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.680125441003528, |
|
"grad_norm": 4.604685306549072, |
|
"learning_rate": 7.898148148148149e-06, |
|
"loss": 1.0631, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.6820854566836535, |
|
"grad_norm": 5.426617622375488, |
|
"learning_rate": 7.88888888888889e-06, |
|
"loss": 1.0691, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.6840454723637789, |
|
"grad_norm": 4.578293323516846, |
|
"learning_rate": 7.87962962962963e-06, |
|
"loss": 1.0761, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.6860054880439044, |
|
"grad_norm": 5.362606525421143, |
|
"learning_rate": 7.870370370370372e-06, |
|
"loss": 1.0836, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6860054880439044, |
|
"eval_loss": 1.0641947984695435, |
|
"eval_runtime": 14.0879, |
|
"eval_samples_per_second": 46.423, |
|
"eval_steps_per_second": 5.821, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6879655037240298, |
|
"grad_norm": 4.338893890380859, |
|
"learning_rate": 7.861111111111112e-06, |
|
"loss": 1.0604, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.6899255194041553, |
|
"grad_norm": 5.606161594390869, |
|
"learning_rate": 7.851851851851853e-06, |
|
"loss": 1.0869, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.6918855350842806, |
|
"grad_norm": 4.934096336364746, |
|
"learning_rate": 7.842592592592593e-06, |
|
"loss": 1.07, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.6938455507644061, |
|
"grad_norm": 4.351364612579346, |
|
"learning_rate": 7.833333333333333e-06, |
|
"loss": 1.0682, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.6958055664445315, |
|
"grad_norm": 4.8932976722717285, |
|
"learning_rate": 7.824074074074076e-06, |
|
"loss": 1.0766, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.697765582124657, |
|
"grad_norm": 4.501565456390381, |
|
"learning_rate": 7.814814814814816e-06, |
|
"loss": 1.0612, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.6997255978047824, |
|
"grad_norm": 4.929937362670898, |
|
"learning_rate": 7.805555555555556e-06, |
|
"loss": 1.0609, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.7016856134849079, |
|
"grad_norm": 5.501171112060547, |
|
"learning_rate": 7.796296296296297e-06, |
|
"loss": 1.0583, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.7036456291650334, |
|
"grad_norm": 4.9919891357421875, |
|
"learning_rate": 7.787037037037037e-06, |
|
"loss": 1.0603, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.7056056448451588, |
|
"grad_norm": 4.892634868621826, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 1.0718, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7075656605252842, |
|
"grad_norm": 5.0666632652282715, |
|
"learning_rate": 7.76851851851852e-06, |
|
"loss": 1.0733, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.7095256762054096, |
|
"grad_norm": 4.797816753387451, |
|
"learning_rate": 7.75925925925926e-06, |
|
"loss": 1.0664, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.7114856918855351, |
|
"grad_norm": 5.004721164703369, |
|
"learning_rate": 7.75e-06, |
|
"loss": 1.0847, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.7134457075656605, |
|
"grad_norm": 4.80883264541626, |
|
"learning_rate": 7.74074074074074e-06, |
|
"loss": 1.075, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.715405723245786, |
|
"grad_norm": 5.439119815826416, |
|
"learning_rate": 7.731481481481483e-06, |
|
"loss": 1.0517, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.7173657389259114, |
|
"grad_norm": 4.244631290435791, |
|
"learning_rate": 7.722222222222223e-06, |
|
"loss": 1.0836, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.7193257546060369, |
|
"grad_norm": 4.710737705230713, |
|
"learning_rate": 7.712962962962964e-06, |
|
"loss": 1.0748, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.7212857702861623, |
|
"grad_norm": 5.23133659362793, |
|
"learning_rate": 7.703703703703704e-06, |
|
"loss": 1.0843, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.7232457859662877, |
|
"grad_norm": 4.706710338592529, |
|
"learning_rate": 7.694444444444446e-06, |
|
"loss": 1.0637, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.7252058016464131, |
|
"grad_norm": 4.4408955574035645, |
|
"learning_rate": 7.685185185185185e-06, |
|
"loss": 1.0945, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7271658173265386, |
|
"grad_norm": 5.1011199951171875, |
|
"learning_rate": 7.675925925925927e-06, |
|
"loss": 1.0573, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.729125833006664, |
|
"grad_norm": 4.85494327545166, |
|
"learning_rate": 7.666666666666667e-06, |
|
"loss": 1.0686, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.7310858486867895, |
|
"grad_norm": 4.990062236785889, |
|
"learning_rate": 7.657407407407408e-06, |
|
"loss": 1.052, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.733045864366915, |
|
"grad_norm": 4.779956817626953, |
|
"learning_rate": 7.64814814814815e-06, |
|
"loss": 1.0619, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.7350058800470404, |
|
"grad_norm": 4.921076774597168, |
|
"learning_rate": 7.638888888888888e-06, |
|
"loss": 1.0834, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.7369658957271659, |
|
"grad_norm": 4.729534149169922, |
|
"learning_rate": 7.62962962962963e-06, |
|
"loss": 1.0643, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.7389259114072912, |
|
"grad_norm": 4.321104049682617, |
|
"learning_rate": 7.620370370370372e-06, |
|
"loss": 1.0736, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.7408859270874167, |
|
"grad_norm": 5.252387523651123, |
|
"learning_rate": 7.611111111111111e-06, |
|
"loss": 1.0515, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.7428459427675421, |
|
"grad_norm": 4.770890712738037, |
|
"learning_rate": 7.6018518518518525e-06, |
|
"loss": 1.0865, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.7448059584476676, |
|
"grad_norm": 4.786081790924072, |
|
"learning_rate": 7.592592592592594e-06, |
|
"loss": 1.0727, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.746765974127793, |
|
"grad_norm": 5.215248107910156, |
|
"learning_rate": 7.583333333333333e-06, |
|
"loss": 1.0657, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.7487259898079185, |
|
"grad_norm": 8.386008262634277, |
|
"learning_rate": 7.5740740740740745e-06, |
|
"loss": 1.0774, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.7506860054880439, |
|
"grad_norm": 5.009541034698486, |
|
"learning_rate": 7.564814814814816e-06, |
|
"loss": 1.0517, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.7526460211681694, |
|
"grad_norm": 5.512828826904297, |
|
"learning_rate": 7.555555555555556e-06, |
|
"loss": 1.0661, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.7546060368482947, |
|
"grad_norm": 4.983898162841797, |
|
"learning_rate": 7.546296296296297e-06, |
|
"loss": 1.0561, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.7565660525284202, |
|
"grad_norm": 4.687797546386719, |
|
"learning_rate": 7.537037037037037e-06, |
|
"loss": 1.0623, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.7585260682085456, |
|
"grad_norm": 4.924409866333008, |
|
"learning_rate": 7.527777777777778e-06, |
|
"loss": 1.0557, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.7604860838886711, |
|
"grad_norm": 4.704287528991699, |
|
"learning_rate": 7.518518518518519e-06, |
|
"loss": 1.0732, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.7624460995687965, |
|
"grad_norm": 5.610473155975342, |
|
"learning_rate": 7.50925925925926e-06, |
|
"loss": 1.0827, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.764406115248922, |
|
"grad_norm": 5.7728590965271, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.0738, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7663661309290475, |
|
"grad_norm": 4.7245659828186035, |
|
"learning_rate": 7.4907407407407414e-06, |
|
"loss": 1.0665, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.7683261466091729, |
|
"grad_norm": 6.519947528839111, |
|
"learning_rate": 7.481481481481482e-06, |
|
"loss": 1.0631, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.7702861622892984, |
|
"grad_norm": 4.989842891693115, |
|
"learning_rate": 7.472222222222223e-06, |
|
"loss": 1.053, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.7722461779694237, |
|
"grad_norm": 4.687224864959717, |
|
"learning_rate": 7.462962962962964e-06, |
|
"loss": 1.0593, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.7742061936495492, |
|
"grad_norm": 4.693930625915527, |
|
"learning_rate": 7.453703703703704e-06, |
|
"loss": 1.0665, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7761662093296746, |
|
"grad_norm": 5.265392303466797, |
|
"learning_rate": 7.444444444444445e-06, |
|
"loss": 1.042, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.7781262250098001, |
|
"grad_norm": 5.122781753540039, |
|
"learning_rate": 7.4351851851851855e-06, |
|
"loss": 1.0748, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.7800862406899255, |
|
"grad_norm": 4.934235095977783, |
|
"learning_rate": 7.425925925925927e-06, |
|
"loss": 1.0603, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.782046256370051, |
|
"grad_norm": 4.4583587646484375, |
|
"learning_rate": 7.416666666666668e-06, |
|
"loss": 1.0724, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.7840062720501764, |
|
"grad_norm": 5.3386030197143555, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 1.0651, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7859662877303019, |
|
"grad_norm": 4.659664154052734, |
|
"learning_rate": 7.398148148148149e-06, |
|
"loss": 1.0525, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.7879263034104272, |
|
"grad_norm": 4.794917106628418, |
|
"learning_rate": 7.38888888888889e-06, |
|
"loss": 1.0561, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.7898863190905527, |
|
"grad_norm": 5.210708141326904, |
|
"learning_rate": 7.3796296296296295e-06, |
|
"loss": 1.0641, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.7918463347706781, |
|
"grad_norm": 4.441596031188965, |
|
"learning_rate": 7.370370370370371e-06, |
|
"loss": 1.0696, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.7938063504508036, |
|
"grad_norm": 5.239253044128418, |
|
"learning_rate": 7.361111111111112e-06, |
|
"loss": 1.0576, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.795766366130929, |
|
"grad_norm": 5.195384502410889, |
|
"learning_rate": 7.351851851851852e-06, |
|
"loss": 1.0556, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.7977263818110545, |
|
"grad_norm": 5.0446648597717285, |
|
"learning_rate": 7.342592592592594e-06, |
|
"loss": 1.067, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.79968639749118, |
|
"grad_norm": 5.061046600341797, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 1.0587, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.8016464131713054, |
|
"grad_norm": 7.661867141723633, |
|
"learning_rate": 7.324074074074074e-06, |
|
"loss": 1.0694, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.8036064288514309, |
|
"grad_norm": 4.821944236755371, |
|
"learning_rate": 7.314814814814816e-06, |
|
"loss": 1.0675, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8055664445315562, |
|
"grad_norm": 4.810634613037109, |
|
"learning_rate": 7.305555555555556e-06, |
|
"loss": 1.0589, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.8075264602116817, |
|
"grad_norm": 5.0643157958984375, |
|
"learning_rate": 7.296296296296297e-06, |
|
"loss": 1.0714, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.8094864758918071, |
|
"grad_norm": 4.812649250030518, |
|
"learning_rate": 7.287037037037038e-06, |
|
"loss": 1.0635, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.8114464915719326, |
|
"grad_norm": 4.7638840675354, |
|
"learning_rate": 7.277777777777778e-06, |
|
"loss": 1.064, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.813406507252058, |
|
"grad_norm": 4.386782646179199, |
|
"learning_rate": 7.268518518518519e-06, |
|
"loss": 1.0726, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.8153665229321835, |
|
"grad_norm": 4.593876838684082, |
|
"learning_rate": 7.2592592592592605e-06, |
|
"loss": 1.0542, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.8173265386123089, |
|
"grad_norm": 4.822241306304932, |
|
"learning_rate": 7.25e-06, |
|
"loss": 1.0541, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.8192865542924344, |
|
"grad_norm": 4.806110858917236, |
|
"learning_rate": 7.240740740740741e-06, |
|
"loss": 1.0522, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.8212465699725597, |
|
"grad_norm": 4.5700788497924805, |
|
"learning_rate": 7.231481481481482e-06, |
|
"loss": 1.0672, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.8232065856526852, |
|
"grad_norm": 4.629611492156982, |
|
"learning_rate": 7.222222222222223e-06, |
|
"loss": 1.072, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8232065856526852, |
|
"eval_loss": 1.0551141500473022, |
|
"eval_runtime": 13.9585, |
|
"eval_samples_per_second": 46.853, |
|
"eval_steps_per_second": 5.875, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8251666013328106, |
|
"grad_norm": 5.06083345413208, |
|
"learning_rate": 7.212962962962964e-06, |
|
"loss": 1.0641, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.8271266170129361, |
|
"grad_norm": 4.53200626373291, |
|
"learning_rate": 7.203703703703704e-06, |
|
"loss": 1.0764, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.8290866326930616, |
|
"grad_norm": 4.572458744049072, |
|
"learning_rate": 7.194444444444445e-06, |
|
"loss": 1.0616, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.831046648373187, |
|
"grad_norm": 21.508056640625, |
|
"learning_rate": 7.185185185185186e-06, |
|
"loss": 1.0606, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.8330066640533125, |
|
"grad_norm": 5.574312210083008, |
|
"learning_rate": 7.1759259259259266e-06, |
|
"loss": 1.0736, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.8349666797334379, |
|
"grad_norm": 4.778395175933838, |
|
"learning_rate": 7.166666666666667e-06, |
|
"loss": 1.0561, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.8369266954135633, |
|
"grad_norm": 4.696834564208984, |
|
"learning_rate": 7.157407407407408e-06, |
|
"loss": 1.0753, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.8388867110936887, |
|
"grad_norm": 4.8193769454956055, |
|
"learning_rate": 7.1481481481481486e-06, |
|
"loss": 1.0454, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.8408467267738142, |
|
"grad_norm": 4.693731784820557, |
|
"learning_rate": 7.13888888888889e-06, |
|
"loss": 1.0634, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.8428067424539396, |
|
"grad_norm": 5.045989990234375, |
|
"learning_rate": 7.129629629629629e-06, |
|
"loss": 1.057, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8447667581340651, |
|
"grad_norm": 5.245114326477051, |
|
"learning_rate": 7.120370370370371e-06, |
|
"loss": 1.0435, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.8467267738141905, |
|
"grad_norm": 5.094894886016846, |
|
"learning_rate": 7.111111111111112e-06, |
|
"loss": 1.0657, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.848686789494316, |
|
"grad_norm": 4.630204200744629, |
|
"learning_rate": 7.101851851851852e-06, |
|
"loss": 1.0729, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.8506468051744414, |
|
"grad_norm": 4.463089942932129, |
|
"learning_rate": 7.0925925925925935e-06, |
|
"loss": 1.0573, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.8526068208545668, |
|
"grad_norm": 4.479245185852051, |
|
"learning_rate": 7.083333333333335e-06, |
|
"loss": 1.0534, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.8545668365346922, |
|
"grad_norm": 4.832218170166016, |
|
"learning_rate": 7.074074074074074e-06, |
|
"loss": 1.0499, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.8565268522148177, |
|
"grad_norm": 4.618699550628662, |
|
"learning_rate": 7.0648148148148155e-06, |
|
"loss": 1.073, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.8584868678949431, |
|
"grad_norm": 4.646695137023926, |
|
"learning_rate": 7.055555555555557e-06, |
|
"loss": 1.0662, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.8604468835750686, |
|
"grad_norm": 4.74224853515625, |
|
"learning_rate": 7.046296296296296e-06, |
|
"loss": 1.0516, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.8624068992551941, |
|
"grad_norm": 4.836453914642334, |
|
"learning_rate": 7.0370370370370375e-06, |
|
"loss": 1.0622, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8643669149353195, |
|
"grad_norm": 5.0554046630859375, |
|
"learning_rate": 7.027777777777778e-06, |
|
"loss": 1.0725, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.866326930615445, |
|
"grad_norm": 4.8123579025268555, |
|
"learning_rate": 7.018518518518519e-06, |
|
"loss": 1.0451, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.8682869462955703, |
|
"grad_norm": 4.815413951873779, |
|
"learning_rate": 7.00925925925926e-06, |
|
"loss": 1.057, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.8702469619756958, |
|
"grad_norm": 5.103263854980469, |
|
"learning_rate": 7e-06, |
|
"loss": 1.073, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.8722069776558212, |
|
"grad_norm": 5.115508556365967, |
|
"learning_rate": 6.990740740740741e-06, |
|
"loss": 1.058, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8741669933359467, |
|
"grad_norm": 4.976015567779541, |
|
"learning_rate": 6.981481481481482e-06, |
|
"loss": 1.0601, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.8761270090160721, |
|
"grad_norm": 5.111723899841309, |
|
"learning_rate": 6.972222222222223e-06, |
|
"loss": 1.0735, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.8780870246961976, |
|
"grad_norm": 4.711007118225098, |
|
"learning_rate": 6.962962962962964e-06, |
|
"loss": 1.0723, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.880047040376323, |
|
"grad_norm": 4.8820881843566895, |
|
"learning_rate": 6.953703703703704e-06, |
|
"loss": 1.0562, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.8820070560564485, |
|
"grad_norm": 5.674580097198486, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 1.0657, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8839670717365739, |
|
"grad_norm": 4.517936706542969, |
|
"learning_rate": 6.935185185185186e-06, |
|
"loss": 1.0604, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.8859270874166993, |
|
"grad_norm": 5.491835594177246, |
|
"learning_rate": 6.9259259259259256e-06, |
|
"loss": 1.0635, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.8878871030968247, |
|
"grad_norm": 5.377597808837891, |
|
"learning_rate": 6.916666666666667e-06, |
|
"loss": 1.0567, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.8898471187769502, |
|
"grad_norm": 5.251382350921631, |
|
"learning_rate": 6.907407407407408e-06, |
|
"loss": 1.0564, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.8918071344570757, |
|
"grad_norm": 4.623981952667236, |
|
"learning_rate": 6.898148148148148e-06, |
|
"loss": 1.0663, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8937671501372011, |
|
"grad_norm": 4.460480690002441, |
|
"learning_rate": 6.88888888888889e-06, |
|
"loss": 1.0773, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.8957271658173266, |
|
"grad_norm": 4.909152984619141, |
|
"learning_rate": 6.879629629629631e-06, |
|
"loss": 1.0462, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.897687181497452, |
|
"grad_norm": 4.5406951904296875, |
|
"learning_rate": 6.8703703703703704e-06, |
|
"loss": 1.0619, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.8996471971775775, |
|
"grad_norm": 4.8268890380859375, |
|
"learning_rate": 6.861111111111112e-06, |
|
"loss": 1.0749, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.9016072128577028, |
|
"grad_norm": 5.024204254150391, |
|
"learning_rate": 6.851851851851853e-06, |
|
"loss": 1.082, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9035672285378283, |
|
"grad_norm": 4.6898908615112305, |
|
"learning_rate": 6.842592592592593e-06, |
|
"loss": 1.0527, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.9055272442179537, |
|
"grad_norm": 5.445582866668701, |
|
"learning_rate": 6.833333333333334e-06, |
|
"loss": 1.05, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.9074872598980792, |
|
"grad_norm": 4.625185966491699, |
|
"learning_rate": 6.824074074074075e-06, |
|
"loss": 1.0623, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.9094472755782046, |
|
"grad_norm": 4.991087913513184, |
|
"learning_rate": 6.814814814814815e-06, |
|
"loss": 1.048, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.9114072912583301, |
|
"grad_norm": 5.890523433685303, |
|
"learning_rate": 6.8055555555555566e-06, |
|
"loss": 1.0539, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.9133673069384555, |
|
"grad_norm": 4.583831787109375, |
|
"learning_rate": 6.796296296296296e-06, |
|
"loss": 1.0616, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.915327322618581, |
|
"grad_norm": 5.098804473876953, |
|
"learning_rate": 6.787037037037037e-06, |
|
"loss": 1.0555, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.9172873382987063, |
|
"grad_norm": 4.51339864730835, |
|
"learning_rate": 6.777777777777779e-06, |
|
"loss": 1.0563, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.9192473539788318, |
|
"grad_norm": 5.437107086181641, |
|
"learning_rate": 6.768518518518519e-06, |
|
"loss": 1.0639, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.9212073696589572, |
|
"grad_norm": 5.220668315887451, |
|
"learning_rate": 6.75925925925926e-06, |
|
"loss": 1.0335, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.9231673853390827, |
|
"grad_norm": 5.214885711669922, |
|
"learning_rate": 6.750000000000001e-06, |
|
"loss": 1.0616, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.9251274010192082, |
|
"grad_norm": 4.807358264923096, |
|
"learning_rate": 6.740740740740741e-06, |
|
"loss": 1.0556, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.9270874166993336, |
|
"grad_norm": 4.536919116973877, |
|
"learning_rate": 6.731481481481482e-06, |
|
"loss": 1.0427, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.9290474323794591, |
|
"grad_norm": 4.960407733917236, |
|
"learning_rate": 6.7222222222222235e-06, |
|
"loss": 1.0573, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.9310074480595845, |
|
"grad_norm": 5.058611869812012, |
|
"learning_rate": 6.712962962962963e-06, |
|
"loss": 1.0529, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.93296746373971, |
|
"grad_norm": 4.611677169799805, |
|
"learning_rate": 6.703703703703704e-06, |
|
"loss": 1.0609, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.9349274794198353, |
|
"grad_norm": 4.662081241607666, |
|
"learning_rate": 6.694444444444445e-06, |
|
"loss": 1.0585, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.9368874950999608, |
|
"grad_norm": 4.845633029937744, |
|
"learning_rate": 6.685185185185186e-06, |
|
"loss": 1.0591, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.9388475107800862, |
|
"grad_norm": 4.63094425201416, |
|
"learning_rate": 6.675925925925927e-06, |
|
"loss": 1.051, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.9408075264602117, |
|
"grad_norm": 5.272851467132568, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.0643, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9427675421403371, |
|
"grad_norm": 5.056580066680908, |
|
"learning_rate": 6.657407407407408e-06, |
|
"loss": 1.0483, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.9447275578204626, |
|
"grad_norm": 4.985904693603516, |
|
"learning_rate": 6.648148148148149e-06, |
|
"loss": 1.0526, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.946687573500588, |
|
"grad_norm": 5.002072334289551, |
|
"learning_rate": 6.6388888888888895e-06, |
|
"loss": 1.0693, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.9486475891807135, |
|
"grad_norm": 5.099184036254883, |
|
"learning_rate": 6.62962962962963e-06, |
|
"loss": 1.0629, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.9506076048608388, |
|
"grad_norm": 4.710622310638428, |
|
"learning_rate": 6.620370370370371e-06, |
|
"loss": 1.0458, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.9525676205409643, |
|
"grad_norm": 5.143291473388672, |
|
"learning_rate": 6.6111111111111115e-06, |
|
"loss": 1.0552, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.9545276362210897, |
|
"grad_norm": 5.28003454208374, |
|
"learning_rate": 6.601851851851853e-06, |
|
"loss": 1.0643, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.9564876519012152, |
|
"grad_norm": 5.275862216949463, |
|
"learning_rate": 6.592592592592592e-06, |
|
"loss": 1.0566, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.9584476675813407, |
|
"grad_norm": 4.868998050689697, |
|
"learning_rate": 6.5833333333333335e-06, |
|
"loss": 1.0703, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.9604076832614661, |
|
"grad_norm": 4.964428424835205, |
|
"learning_rate": 6.574074074074075e-06, |
|
"loss": 1.0564, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9604076832614661, |
|
"eval_loss": 1.0497612953186035, |
|
"eval_runtime": 14.0356, |
|
"eval_samples_per_second": 46.596, |
|
"eval_steps_per_second": 5.842, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9623676989415916, |
|
"grad_norm": 4.545341968536377, |
|
"learning_rate": 6.564814814814815e-06, |
|
"loss": 1.0741, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.964327714621717, |
|
"grad_norm": 5.679270267486572, |
|
"learning_rate": 6.555555555555556e-06, |
|
"loss": 1.0528, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.9662877303018425, |
|
"grad_norm": 4.5773749351501465, |
|
"learning_rate": 6.546296296296298e-06, |
|
"loss": 1.048, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.9682477459819678, |
|
"grad_norm": 5.288699150085449, |
|
"learning_rate": 6.537037037037037e-06, |
|
"loss": 1.0487, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.9702077616620933, |
|
"grad_norm": 5.2343363761901855, |
|
"learning_rate": 6.5277777777777784e-06, |
|
"loss": 1.0523, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.9721677773422187, |
|
"grad_norm": 4.845068454742432, |
|
"learning_rate": 6.51851851851852e-06, |
|
"loss": 1.0519, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.9741277930223442, |
|
"grad_norm": 5.101285457611084, |
|
"learning_rate": 6.509259259259259e-06, |
|
"loss": 1.0618, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.9760878087024696, |
|
"grad_norm": 4.684749126434326, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 1.0506, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.9780478243825951, |
|
"grad_norm": 4.986086368560791, |
|
"learning_rate": 6.490740740740741e-06, |
|
"loss": 1.0621, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.9800078400627205, |
|
"grad_norm": 4.658474922180176, |
|
"learning_rate": 6.481481481481482e-06, |
|
"loss": 1.0727, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.981967855742846, |
|
"grad_norm": 5.1348419189453125, |
|
"learning_rate": 6.472222222222223e-06, |
|
"loss": 1.0542, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.9839278714229713, |
|
"grad_norm": 5.368491172790527, |
|
"learning_rate": 6.462962962962963e-06, |
|
"loss": 1.0656, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.9858878871030968, |
|
"grad_norm": 5.312760829925537, |
|
"learning_rate": 6.453703703703704e-06, |
|
"loss": 1.0587, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.9878479027832223, |
|
"grad_norm": 5.235229969024658, |
|
"learning_rate": 6.444444444444445e-06, |
|
"loss": 1.0588, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.9898079184633477, |
|
"grad_norm": 4.761176586151123, |
|
"learning_rate": 6.435185185185186e-06, |
|
"loss": 1.0507, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.9917679341434732, |
|
"grad_norm": 5.440311431884766, |
|
"learning_rate": 6.425925925925927e-06, |
|
"loss": 1.0566, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.9937279498235986, |
|
"grad_norm": 4.793478488922119, |
|
"learning_rate": 6.416666666666667e-06, |
|
"loss": 1.0607, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.9956879655037241, |
|
"grad_norm": 5.079543113708496, |
|
"learning_rate": 6.407407407407408e-06, |
|
"loss": 1.0494, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.9976479811838495, |
|
"grad_norm": 4.377906322479248, |
|
"learning_rate": 6.398148148148149e-06, |
|
"loss": 1.0558, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.999607996863975, |
|
"grad_norm": 4.611093521118164, |
|
"learning_rate": 6.3888888888888885e-06, |
|
"loss": 1.0693, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.0015680125441004, |
|
"grad_norm": 4.401882171630859, |
|
"learning_rate": 6.37962962962963e-06, |
|
"loss": 1.0297, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 1.0035280282242258, |
|
"grad_norm": 4.928489685058594, |
|
"learning_rate": 6.370370370370371e-06, |
|
"loss": 1.0276, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 1.0054880439043512, |
|
"grad_norm": 5.5292487144470215, |
|
"learning_rate": 6.361111111111111e-06, |
|
"loss": 1.0267, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 1.0074480595844766, |
|
"grad_norm": 5.00799036026001, |
|
"learning_rate": 6.351851851851853e-06, |
|
"loss": 1.0173, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 1.0094080752646022, |
|
"grad_norm": 4.399227619171143, |
|
"learning_rate": 6.342592592592594e-06, |
|
"loss": 1.0106, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.0113680909447276, |
|
"grad_norm": 5.132420539855957, |
|
"learning_rate": 6.333333333333333e-06, |
|
"loss": 1.0212, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 1.013328106624853, |
|
"grad_norm": 5.298924446105957, |
|
"learning_rate": 6.324074074074075e-06, |
|
"loss": 1.0122, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 1.0152881223049783, |
|
"grad_norm": 5.066485404968262, |
|
"learning_rate": 6.314814814814816e-06, |
|
"loss": 1.0114, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.017248137985104, |
|
"grad_norm": 6.66790771484375, |
|
"learning_rate": 6.305555555555556e-06, |
|
"loss": 1.0128, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 1.0192081536652293, |
|
"grad_norm": 4.905239105224609, |
|
"learning_rate": 6.296296296296297e-06, |
|
"loss": 1.0118, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.0211681693453547, |
|
"grad_norm": 5.377009868621826, |
|
"learning_rate": 6.287037037037037e-06, |
|
"loss": 1.0229, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 1.0231281850254803, |
|
"grad_norm": 5.1396942138671875, |
|
"learning_rate": 6.277777777777778e-06, |
|
"loss": 1.0221, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 1.0250882007056057, |
|
"grad_norm": 5.001946449279785, |
|
"learning_rate": 6.2685185185185195e-06, |
|
"loss": 1.0461, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 1.027048216385731, |
|
"grad_norm": 6.100106716156006, |
|
"learning_rate": 6.259259259259259e-06, |
|
"loss": 1.0169, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 1.0290082320658565, |
|
"grad_norm": 5.1383209228515625, |
|
"learning_rate": 6.25e-06, |
|
"loss": 1.0282, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.030968247745982, |
|
"grad_norm": 5.477851390838623, |
|
"learning_rate": 6.2407407407407415e-06, |
|
"loss": 1.0266, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 1.0329282634261074, |
|
"grad_norm": 5.047489643096924, |
|
"learning_rate": 6.231481481481482e-06, |
|
"loss": 1.0256, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 1.0348882791062328, |
|
"grad_norm": 4.994131565093994, |
|
"learning_rate": 6.222222222222223e-06, |
|
"loss": 1.0224, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 1.0368482947863582, |
|
"grad_norm": 5.558932781219482, |
|
"learning_rate": 6.2129629629629636e-06, |
|
"loss": 1.0186, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 1.0388083104664838, |
|
"grad_norm": 5.365056037902832, |
|
"learning_rate": 6.203703703703704e-06, |
|
"loss": 1.0095, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.0407683261466092, |
|
"grad_norm": 5.53687858581543, |
|
"learning_rate": 6.194444444444445e-06, |
|
"loss": 1.0132, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 1.0427283418267346, |
|
"grad_norm": 5.936549663543701, |
|
"learning_rate": 6.1851851851851856e-06, |
|
"loss": 1.0347, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 1.04468835750686, |
|
"grad_norm": 5.228058338165283, |
|
"learning_rate": 6.175925925925926e-06, |
|
"loss": 1.0234, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 1.0466483731869856, |
|
"grad_norm": 4.896886348724365, |
|
"learning_rate": 6.166666666666667e-06, |
|
"loss": 1.0065, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 1.048608388867111, |
|
"grad_norm": 6.313221454620361, |
|
"learning_rate": 6.157407407407408e-06, |
|
"loss": 1.0191, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.0505684045472363, |
|
"grad_norm": 5.8848114013671875, |
|
"learning_rate": 6.148148148148149e-06, |
|
"loss": 1.0378, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 1.052528420227362, |
|
"grad_norm": 5.468878746032715, |
|
"learning_rate": 6.13888888888889e-06, |
|
"loss": 1.0056, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 1.0544884359074873, |
|
"grad_norm": 5.208171367645264, |
|
"learning_rate": 6.12962962962963e-06, |
|
"loss": 1.0194, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 1.0564484515876127, |
|
"grad_norm": 5.1294450759887695, |
|
"learning_rate": 6.120370370370371e-06, |
|
"loss": 1.0084, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 1.058408467267738, |
|
"grad_norm": 5.981677532196045, |
|
"learning_rate": 6.111111111111112e-06, |
|
"loss": 1.0148, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0603684829478637, |
|
"grad_norm": 5.662644863128662, |
|
"learning_rate": 6.1018518518518525e-06, |
|
"loss": 1.0163, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 1.062328498627989, |
|
"grad_norm": 5.452698230743408, |
|
"learning_rate": 6.092592592592593e-06, |
|
"loss": 1.0047, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 1.0642885143081144, |
|
"grad_norm": 4.969189167022705, |
|
"learning_rate": 6.083333333333333e-06, |
|
"loss": 1.0031, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 1.0662485299882398, |
|
"grad_norm": 5.396177768707275, |
|
"learning_rate": 6.0740740740740745e-06, |
|
"loss": 1.0109, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 1.0682085456683654, |
|
"grad_norm": 6.180945873260498, |
|
"learning_rate": 6.064814814814816e-06, |
|
"loss": 1.0318, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.0701685613484908, |
|
"grad_norm": 4.878726959228516, |
|
"learning_rate": 6.055555555555555e-06, |
|
"loss": 1.0257, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 1.0721285770286162, |
|
"grad_norm": 5.634671688079834, |
|
"learning_rate": 6.0462962962962965e-06, |
|
"loss": 1.0241, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 1.0740885927087416, |
|
"grad_norm": 5.971144199371338, |
|
"learning_rate": 6.037037037037038e-06, |
|
"loss": 1.0247, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 1.0760486083888672, |
|
"grad_norm": 5.1925950050354, |
|
"learning_rate": 6.027777777777778e-06, |
|
"loss": 1.0094, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 1.0780086240689926, |
|
"grad_norm": 4.921753406524658, |
|
"learning_rate": 6.018518518518519e-06, |
|
"loss": 1.0233, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.079968639749118, |
|
"grad_norm": 5.969301700592041, |
|
"learning_rate": 6.009259259259261e-06, |
|
"loss": 0.994, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 1.0819286554292433, |
|
"grad_norm": 5.875487804412842, |
|
"learning_rate": 6e-06, |
|
"loss": 1.0171, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 1.083888671109369, |
|
"grad_norm": 5.540329933166504, |
|
"learning_rate": 5.990740740740741e-06, |
|
"loss": 1.0198, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 1.0858486867894943, |
|
"grad_norm": 5.154452323913574, |
|
"learning_rate": 5.981481481481482e-06, |
|
"loss": 1.0174, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 1.0878087024696197, |
|
"grad_norm": 5.469849586486816, |
|
"learning_rate": 5.972222222222222e-06, |
|
"loss": 1.0141, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.0897687181497453, |
|
"grad_norm": 6.041581153869629, |
|
"learning_rate": 5.962962962962963e-06, |
|
"loss": 1.0216, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 1.0917287338298707, |
|
"grad_norm": 5.2390522956848145, |
|
"learning_rate": 5.953703703703704e-06, |
|
"loss": 1.0202, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 1.093688749509996, |
|
"grad_norm": 5.63343620300293, |
|
"learning_rate": 5.944444444444445e-06, |
|
"loss": 1.0234, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 1.0956487651901214, |
|
"grad_norm": 14.769401550292969, |
|
"learning_rate": 5.935185185185186e-06, |
|
"loss": 1.019, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 1.097608780870247, |
|
"grad_norm": 5.917840957641602, |
|
"learning_rate": 5.925925925925926e-06, |
|
"loss": 1.0259, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.097608780870247, |
|
"eval_loss": 1.0469825267791748, |
|
"eval_runtime": 14.056, |
|
"eval_samples_per_second": 46.528, |
|
"eval_steps_per_second": 5.834, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.0995687965503724, |
|
"grad_norm": 4.929704666137695, |
|
"learning_rate": 5.916666666666667e-06, |
|
"loss": 1.0146, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 1.1015288122304978, |
|
"grad_norm": 6.432901382446289, |
|
"learning_rate": 5.907407407407408e-06, |
|
"loss": 1.0268, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 1.1034888279106232, |
|
"grad_norm": 5.020516872406006, |
|
"learning_rate": 5.898148148148149e-06, |
|
"loss": 1.0272, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 1.1054488435907488, |
|
"grad_norm": 5.231268882751465, |
|
"learning_rate": 5.88888888888889e-06, |
|
"loss": 1.0046, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 1.1074088592708742, |
|
"grad_norm": 5.203913688659668, |
|
"learning_rate": 5.8796296296296295e-06, |
|
"loss": 1.015, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.1093688749509996, |
|
"grad_norm": 5.593926906585693, |
|
"learning_rate": 5.870370370370371e-06, |
|
"loss": 1.0185, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 1.1113288906311252, |
|
"grad_norm": 5.165170669555664, |
|
"learning_rate": 5.861111111111112e-06, |
|
"loss": 1.0056, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 1.1132889063112505, |
|
"grad_norm": 5.521978855133057, |
|
"learning_rate": 5.8518518518518515e-06, |
|
"loss": 1.022, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 1.115248921991376, |
|
"grad_norm": 5.280319690704346, |
|
"learning_rate": 5.842592592592593e-06, |
|
"loss": 1.0357, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 1.1172089376715013, |
|
"grad_norm": 6.761595726013184, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 1.0229, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.119168953351627, |
|
"grad_norm": 5.606836318969727, |
|
"learning_rate": 5.824074074074074e-06, |
|
"loss": 1.019, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 1.1211289690317523, |
|
"grad_norm": 5.351015567779541, |
|
"learning_rate": 5.814814814814816e-06, |
|
"loss": 1.0343, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 1.1230889847118777, |
|
"grad_norm": 4.853587627410889, |
|
"learning_rate": 5.805555555555557e-06, |
|
"loss": 1.0239, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 1.125049000392003, |
|
"grad_norm": 16.355466842651367, |
|
"learning_rate": 5.796296296296296e-06, |
|
"loss": 1.0013, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 1.1270090160721287, |
|
"grad_norm": 5.404659748077393, |
|
"learning_rate": 5.787037037037038e-06, |
|
"loss": 1.0181, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.128969031752254, |
|
"grad_norm": 5.103781700134277, |
|
"learning_rate": 5.777777777777778e-06, |
|
"loss": 1.0213, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 1.1309290474323794, |
|
"grad_norm": 5.420756816864014, |
|
"learning_rate": 5.768518518518519e-06, |
|
"loss": 1.0072, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 1.1328890631125048, |
|
"grad_norm": 5.301304817199707, |
|
"learning_rate": 5.75925925925926e-06, |
|
"loss": 0.9901, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 1.1348490787926304, |
|
"grad_norm": 4.864381790161133, |
|
"learning_rate": 5.75e-06, |
|
"loss": 1.0129, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 1.1368090944727558, |
|
"grad_norm": 6.017991542816162, |
|
"learning_rate": 5.740740740740741e-06, |
|
"loss": 1.0113, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.1387691101528812, |
|
"grad_norm": 5.6904520988464355, |
|
"learning_rate": 5.7314814814814825e-06, |
|
"loss": 1.0211, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 1.1407291258330066, |
|
"grad_norm": 5.778028964996338, |
|
"learning_rate": 5.722222222222222e-06, |
|
"loss": 1.0247, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 1.1426891415131322, |
|
"grad_norm": 5.682682991027832, |
|
"learning_rate": 5.712962962962963e-06, |
|
"loss": 1.0309, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 1.1446491571932575, |
|
"grad_norm": 5.886664867401123, |
|
"learning_rate": 5.7037037037037045e-06, |
|
"loss": 1.0244, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 1.146609172873383, |
|
"grad_norm": 5.014996528625488, |
|
"learning_rate": 5.694444444444445e-06, |
|
"loss": 1.0207, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.1485691885535085, |
|
"grad_norm": 5.563379287719727, |
|
"learning_rate": 5.685185185185186e-06, |
|
"loss": 1.0276, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 1.150529204233634, |
|
"grad_norm": 5.241649627685547, |
|
"learning_rate": 5.675925925925926e-06, |
|
"loss": 1.022, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 1.1524892199137593, |
|
"grad_norm": 5.154331207275391, |
|
"learning_rate": 5.666666666666667e-06, |
|
"loss": 1.0232, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 1.1544492355938847, |
|
"grad_norm": 5.823696136474609, |
|
"learning_rate": 5.657407407407408e-06, |
|
"loss": 1.0364, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 1.1564092512740103, |
|
"grad_norm": 5.451704978942871, |
|
"learning_rate": 5.6481481481481485e-06, |
|
"loss": 1.02, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.1583692669541357, |
|
"grad_norm": 5.540503978729248, |
|
"learning_rate": 5.638888888888889e-06, |
|
"loss": 1.0127, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 1.160329282634261, |
|
"grad_norm": 5.463021278381348, |
|
"learning_rate": 5.62962962962963e-06, |
|
"loss": 1.0205, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 1.1622892983143864, |
|
"grad_norm": 5.372107028961182, |
|
"learning_rate": 5.6203703703703705e-06, |
|
"loss": 1.0085, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 1.164249313994512, |
|
"grad_norm": 5.14231538772583, |
|
"learning_rate": 5.611111111111112e-06, |
|
"loss": 1.0189, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 1.1662093296746374, |
|
"grad_norm": 5.416358470916748, |
|
"learning_rate": 5.601851851851853e-06, |
|
"loss": 1.0066, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.1681693453547628, |
|
"grad_norm": 5.824409484863281, |
|
"learning_rate": 5.5925925925925926e-06, |
|
"loss": 1.0294, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 1.1701293610348882, |
|
"grad_norm": 5.362888336181641, |
|
"learning_rate": 5.583333333333334e-06, |
|
"loss": 1.011, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 1.1720893767150138, |
|
"grad_norm": 5.583401679992676, |
|
"learning_rate": 5.574074074074075e-06, |
|
"loss": 1.0218, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 1.1740493923951392, |
|
"grad_norm": 5.501253128051758, |
|
"learning_rate": 5.5648148148148154e-06, |
|
"loss": 1.0198, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 1.1760094080752646, |
|
"grad_norm": 5.536640167236328, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 1.0243, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.17796942375539, |
|
"grad_norm": 5.525614261627197, |
|
"learning_rate": 5.546296296296296e-06, |
|
"loss": 1.0234, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 1.1799294394355155, |
|
"grad_norm": 5.270368576049805, |
|
"learning_rate": 5.5370370370370374e-06, |
|
"loss": 1.0214, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 1.181889455115641, |
|
"grad_norm": 5.895664691925049, |
|
"learning_rate": 5.527777777777779e-06, |
|
"loss": 1.0203, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 1.1838494707957663, |
|
"grad_norm": 5.530089378356934, |
|
"learning_rate": 5.518518518518518e-06, |
|
"loss": 1.0254, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 1.185809486475892, |
|
"grad_norm": 5.275720119476318, |
|
"learning_rate": 5.5092592592592595e-06, |
|
"loss": 1.015, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.1877695021560173, |
|
"grad_norm": 5.464200496673584, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 1.0134, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 1.1897295178361427, |
|
"grad_norm": 5.803837299346924, |
|
"learning_rate": 5.490740740740741e-06, |
|
"loss": 1.0125, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 1.191689533516268, |
|
"grad_norm": 5.1351752281188965, |
|
"learning_rate": 5.481481481481482e-06, |
|
"loss": 1.0061, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 1.1936495491963937, |
|
"grad_norm": 5.232799530029297, |
|
"learning_rate": 5.4722222222222236e-06, |
|
"loss": 1.0213, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 1.195609564876519, |
|
"grad_norm": 5.996954917907715, |
|
"learning_rate": 5.462962962962963e-06, |
|
"loss": 1.0177, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.1975695805566444, |
|
"grad_norm": 7.579484939575195, |
|
"learning_rate": 5.453703703703704e-06, |
|
"loss": 1.0133, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 1.1995295962367698, |
|
"grad_norm": 5.351324081420898, |
|
"learning_rate": 5.444444444444445e-06, |
|
"loss": 1.0019, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 1.2014896119168954, |
|
"grad_norm": 5.5653581619262695, |
|
"learning_rate": 5.435185185185186e-06, |
|
"loss": 1.0069, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 1.2034496275970208, |
|
"grad_norm": 5.198008060455322, |
|
"learning_rate": 5.425925925925926e-06, |
|
"loss": 1.0157, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 1.2054096432771462, |
|
"grad_norm": 5.67602014541626, |
|
"learning_rate": 5.416666666666667e-06, |
|
"loss": 1.0327, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.2073696589572718, |
|
"grad_norm": 5.185479164123535, |
|
"learning_rate": 5.407407407407408e-06, |
|
"loss": 1.0118, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 1.2093296746373972, |
|
"grad_norm": 5.254889965057373, |
|
"learning_rate": 5.398148148148149e-06, |
|
"loss": 1.0282, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 1.2112896903175225, |
|
"grad_norm": 5.635478496551514, |
|
"learning_rate": 5.388888888888889e-06, |
|
"loss": 1.0211, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 1.213249705997648, |
|
"grad_norm": 5.388778209686279, |
|
"learning_rate": 5.37962962962963e-06, |
|
"loss": 1.0171, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 1.2152097216777733, |
|
"grad_norm": 5.546689987182617, |
|
"learning_rate": 5.370370370370371e-06, |
|
"loss": 1.0113, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.217169737357899, |
|
"grad_norm": 5.316011428833008, |
|
"learning_rate": 5.361111111111112e-06, |
|
"loss": 1.0183, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 1.2191297530380243, |
|
"grad_norm": 6.057391166687012, |
|
"learning_rate": 5.351851851851853e-06, |
|
"loss": 1.0149, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 1.2210897687181497, |
|
"grad_norm": 5.451624393463135, |
|
"learning_rate": 5.342592592592592e-06, |
|
"loss": 1.0129, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 1.2230497843982753, |
|
"grad_norm": 5.58920955657959, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.0152, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 1.2250098000784007, |
|
"grad_norm": 5.132778167724609, |
|
"learning_rate": 5.324074074074075e-06, |
|
"loss": 1.0059, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.226969815758526, |
|
"grad_norm": 4.807291030883789, |
|
"learning_rate": 5.314814814814815e-06, |
|
"loss": 1.0146, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 1.2289298314386514, |
|
"grad_norm": 5.3906474113464355, |
|
"learning_rate": 5.305555555555556e-06, |
|
"loss": 1.0045, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 1.230889847118777, |
|
"grad_norm": 5.229481220245361, |
|
"learning_rate": 5.296296296296297e-06, |
|
"loss": 1.0018, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 1.2328498627989024, |
|
"grad_norm": 5.5774126052856445, |
|
"learning_rate": 5.287037037037037e-06, |
|
"loss": 1.0096, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 1.2348098784790278, |
|
"grad_norm": 5.752622127532959, |
|
"learning_rate": 5.2777777777777785e-06, |
|
"loss": 1.0173, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.2348098784790278, |
|
"eval_loss": 1.0454390048980713, |
|
"eval_runtime": 14.0845, |
|
"eval_samples_per_second": 46.434, |
|
"eval_steps_per_second": 5.822, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.2367698941591532, |
|
"grad_norm": 4.995856285095215, |
|
"learning_rate": 5.26851851851852e-06, |
|
"loss": 1.0274, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 1.2387299098392788, |
|
"grad_norm": 5.631256103515625, |
|
"learning_rate": 5.259259259259259e-06, |
|
"loss": 1.029, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 1.2406899255194042, |
|
"grad_norm": 5.617944717407227, |
|
"learning_rate": 5.2500000000000006e-06, |
|
"loss": 1.0087, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 1.2426499411995295, |
|
"grad_norm": 5.174662113189697, |
|
"learning_rate": 5.240740740740741e-06, |
|
"loss": 1.0114, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 1.2446099568796551, |
|
"grad_norm": 5.841761112213135, |
|
"learning_rate": 5.231481481481482e-06, |
|
"loss": 1.0113, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.2465699725597805, |
|
"grad_norm": 5.008285999298096, |
|
"learning_rate": 5.2222222222222226e-06, |
|
"loss": 1.0058, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 1.248529988239906, |
|
"grad_norm": 5.529172420501709, |
|
"learning_rate": 5.212962962962963e-06, |
|
"loss": 1.0225, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 1.2504900039200313, |
|
"grad_norm": 6.809742450714111, |
|
"learning_rate": 5.203703703703704e-06, |
|
"loss": 1.0194, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 1.2524500196001567, |
|
"grad_norm": 5.778404712677002, |
|
"learning_rate": 5.1944444444444454e-06, |
|
"loss": 1.0312, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 1.2544100352802823, |
|
"grad_norm": 6.095964431762695, |
|
"learning_rate": 5.185185185185185e-06, |
|
"loss": 1.0155, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.2563700509604077, |
|
"grad_norm": 5.453831672668457, |
|
"learning_rate": 5.175925925925926e-06, |
|
"loss": 1.0155, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 1.258330066640533, |
|
"grad_norm": 5.425611972808838, |
|
"learning_rate": 5.1666666666666675e-06, |
|
"loss": 1.0353, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 1.2602900823206586, |
|
"grad_norm": 6.835920810699463, |
|
"learning_rate": 5.157407407407408e-06, |
|
"loss": 1.0089, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 1.262250098000784, |
|
"grad_norm": 8.35204029083252, |
|
"learning_rate": 5.148148148148149e-06, |
|
"loss": 1.0235, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 1.2642101136809094, |
|
"grad_norm": 5.1433234214782715, |
|
"learning_rate": 5.138888888888889e-06, |
|
"loss": 1.0226, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.266170129361035, |
|
"grad_norm": 5.907522201538086, |
|
"learning_rate": 5.12962962962963e-06, |
|
"loss": 1.0161, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 1.2681301450411604, |
|
"grad_norm": 5.631725788116455, |
|
"learning_rate": 5.120370370370371e-06, |
|
"loss": 1.014, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 1.2700901607212858, |
|
"grad_norm": 5.635436058044434, |
|
"learning_rate": 5.1111111111111115e-06, |
|
"loss": 1.0216, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 1.2720501764014112, |
|
"grad_norm": 6.376223087310791, |
|
"learning_rate": 5.101851851851852e-06, |
|
"loss": 1.0135, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 1.2740101920815365, |
|
"grad_norm": 6.083188533782959, |
|
"learning_rate": 5.092592592592593e-06, |
|
"loss": 1.0117, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.2759702077616621, |
|
"grad_norm": 5.436441898345947, |
|
"learning_rate": 5.0833333333333335e-06, |
|
"loss": 1.0079, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 1.2779302234417875, |
|
"grad_norm": 5.303137302398682, |
|
"learning_rate": 5.074074074074075e-06, |
|
"loss": 1.02, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 1.279890239121913, |
|
"grad_norm": 5.048837661743164, |
|
"learning_rate": 5.064814814814816e-06, |
|
"loss": 1.0045, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 1.2818502548020385, |
|
"grad_norm": 6.105396747589111, |
|
"learning_rate": 5.0555555555555555e-06, |
|
"loss": 1.0105, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 1.283810270482164, |
|
"grad_norm": 4.97545051574707, |
|
"learning_rate": 5.046296296296297e-06, |
|
"loss": 1.0128, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.2857702861622893, |
|
"grad_norm": 6.359137535095215, |
|
"learning_rate": 5.037037037037037e-06, |
|
"loss": 1.0127, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 1.2877303018424147, |
|
"grad_norm": 6.185931205749512, |
|
"learning_rate": 5.027777777777778e-06, |
|
"loss": 1.0156, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 1.28969031752254, |
|
"grad_norm": 6.333951950073242, |
|
"learning_rate": 5.01851851851852e-06, |
|
"loss": 1.0113, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 1.2916503332026656, |
|
"grad_norm": 5.14535665512085, |
|
"learning_rate": 5.009259259259259e-06, |
|
"loss": 1.0062, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 1.293610348882791, |
|
"grad_norm": 6.086668491363525, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0123, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.2955703645629164, |
|
"grad_norm": 5.336708068847656, |
|
"learning_rate": 4.990740740740741e-06, |
|
"loss": 1.0058, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 1.297530380243042, |
|
"grad_norm": 5.65183687210083, |
|
"learning_rate": 4.981481481481482e-06, |
|
"loss": 1.0232, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 1.2994903959231674, |
|
"grad_norm": 5.356630802154541, |
|
"learning_rate": 4.9722222222222224e-06, |
|
"loss": 1.0248, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 1.3014504116032928, |
|
"grad_norm": 5.61469841003418, |
|
"learning_rate": 4.962962962962964e-06, |
|
"loss": 1.0049, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 1.3034104272834184, |
|
"grad_norm": 5.575668811798096, |
|
"learning_rate": 4.953703703703704e-06, |
|
"loss": 1.0046, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.3053704429635438, |
|
"grad_norm": 5.938807010650635, |
|
"learning_rate": 4.944444444444445e-06, |
|
"loss": 1.0254, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 1.3073304586436691, |
|
"grad_norm": 5.57589054107666, |
|
"learning_rate": 4.935185185185186e-06, |
|
"loss": 1.016, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 1.3092904743237945, |
|
"grad_norm": 5.684128284454346, |
|
"learning_rate": 4.925925925925926e-06, |
|
"loss": 1.0102, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 1.31125049000392, |
|
"grad_norm": 5.43093729019165, |
|
"learning_rate": 4.9166666666666665e-06, |
|
"loss": 1.0092, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 1.3132105056840455, |
|
"grad_norm": 5.595739841461182, |
|
"learning_rate": 4.907407407407408e-06, |
|
"loss": 1.0103, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.315170521364171, |
|
"grad_norm": 6.116888046264648, |
|
"learning_rate": 4.898148148148149e-06, |
|
"loss": 1.0177, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 1.3171305370442963, |
|
"grad_norm": 5.478735446929932, |
|
"learning_rate": 4.888888888888889e-06, |
|
"loss": 1.0147, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 1.3190905527244219, |
|
"grad_norm": 5.878492832183838, |
|
"learning_rate": 4.8796296296296306e-06, |
|
"loss": 1.0189, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 1.3210505684045473, |
|
"grad_norm": 5.890417098999023, |
|
"learning_rate": 4.870370370370371e-06, |
|
"loss": 1.0246, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 1.3230105840846726, |
|
"grad_norm": 5.2328410148620605, |
|
"learning_rate": 4.861111111111111e-06, |
|
"loss": 1.0116, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.3249705997647983, |
|
"grad_norm": 6.1324005126953125, |
|
"learning_rate": 4.851851851851852e-06, |
|
"loss": 1.0028, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 1.3269306154449236, |
|
"grad_norm": 5.392694473266602, |
|
"learning_rate": 4.842592592592593e-06, |
|
"loss": 1.0192, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 1.328890631125049, |
|
"grad_norm": 5.902957439422607, |
|
"learning_rate": 4.833333333333333e-06, |
|
"loss": 1.0145, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 1.3308506468051744, |
|
"grad_norm": 5.553819179534912, |
|
"learning_rate": 4.824074074074075e-06, |
|
"loss": 1.0091, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 1.3328106624852998, |
|
"grad_norm": 6.013294219970703, |
|
"learning_rate": 4.814814814814815e-06, |
|
"loss": 1.0198, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.3347706781654254, |
|
"grad_norm": 5.731135368347168, |
|
"learning_rate": 4.805555555555556e-06, |
|
"loss": 1.014, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 1.3367306938455508, |
|
"grad_norm": 5.954074382781982, |
|
"learning_rate": 4.796296296296297e-06, |
|
"loss": 1.0128, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 1.3386907095256761, |
|
"grad_norm": 5.550869464874268, |
|
"learning_rate": 4.787037037037037e-06, |
|
"loss": 1.0151, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 1.3406507252058018, |
|
"grad_norm": 6.043259620666504, |
|
"learning_rate": 4.777777777777778e-06, |
|
"loss": 1.0069, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 1.3426107408859271, |
|
"grad_norm": 5.002622127532959, |
|
"learning_rate": 4.768518518518519e-06, |
|
"loss": 1.0219, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.3445707565660525, |
|
"grad_norm": 6.016299247741699, |
|
"learning_rate": 4.75925925925926e-06, |
|
"loss": 1.0158, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 1.346530772246178, |
|
"grad_norm": 5.6830220222473145, |
|
"learning_rate": 4.75e-06, |
|
"loss": 1.0092, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 1.3484907879263033, |
|
"grad_norm": 6.092140197753906, |
|
"learning_rate": 4.7407407407407415e-06, |
|
"loss": 1.0241, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 1.3504508036064289, |
|
"grad_norm": 5.837907791137695, |
|
"learning_rate": 4.731481481481482e-06, |
|
"loss": 1.0259, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 1.3524108192865543, |
|
"grad_norm": 5.598001480102539, |
|
"learning_rate": 4.722222222222222e-06, |
|
"loss": 1.0043, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.3543708349666796, |
|
"grad_norm": 5.820865631103516, |
|
"learning_rate": 4.712962962962963e-06, |
|
"loss": 1.0325, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 1.3563308506468053, |
|
"grad_norm": 6.162250518798828, |
|
"learning_rate": 4.703703703703704e-06, |
|
"loss": 1.0118, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 1.3582908663269306, |
|
"grad_norm": 5.53606653213501, |
|
"learning_rate": 4.694444444444445e-06, |
|
"loss": 1.0099, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 1.360250882007056, |
|
"grad_norm": 6.468924045562744, |
|
"learning_rate": 4.6851851851851855e-06, |
|
"loss": 1.0172, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 1.3622108976871816, |
|
"grad_norm": 6.867157936096191, |
|
"learning_rate": 4.675925925925927e-06, |
|
"loss": 1.0021, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.364170913367307, |
|
"grad_norm": 6.068783760070801, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 1.0083, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 1.3661309290474324, |
|
"grad_norm": 5.502538204193115, |
|
"learning_rate": 4.6574074074074076e-06, |
|
"loss": 1.0173, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 1.3680909447275578, |
|
"grad_norm": 5.83467960357666, |
|
"learning_rate": 4.648148148148148e-06, |
|
"loss": 1.031, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 1.3700509604076831, |
|
"grad_norm": 5.383768558502197, |
|
"learning_rate": 4.638888888888889e-06, |
|
"loss": 1.0187, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 1.3720109760878088, |
|
"grad_norm": 5.172204494476318, |
|
"learning_rate": 4.62962962962963e-06, |
|
"loss": 1.0079, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.3720109760878088, |
|
"eval_loss": 1.0383639335632324, |
|
"eval_runtime": 14.0609, |
|
"eval_samples_per_second": 46.512, |
|
"eval_steps_per_second": 5.832, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.3739709917679341, |
|
"grad_norm": 5.811285495758057, |
|
"learning_rate": 4.620370370370371e-06, |
|
"loss": 1.0153, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 1.3759310074480595, |
|
"grad_norm": 5.887601375579834, |
|
"learning_rate": 4.611111111111112e-06, |
|
"loss": 1.0104, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 1.3778910231281851, |
|
"grad_norm": 5.245418548583984, |
|
"learning_rate": 4.6018518518518524e-06, |
|
"loss": 1.0063, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 1.3798510388083105, |
|
"grad_norm": 6.191716194152832, |
|
"learning_rate": 4.592592592592593e-06, |
|
"loss": 1.0254, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 1.3818110544884359, |
|
"grad_norm": 6.047815799713135, |
|
"learning_rate": 4.583333333333333e-06, |
|
"loss": 1.0291, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.3837710701685613, |
|
"grad_norm": 5.58579158782959, |
|
"learning_rate": 4.5740740740740745e-06, |
|
"loss": 1.0126, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 1.3857310858486869, |
|
"grad_norm": 5.651394844055176, |
|
"learning_rate": 4.564814814814815e-06, |
|
"loss": 1.013, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 1.3876911015288123, |
|
"grad_norm": 5.703254699707031, |
|
"learning_rate": 4.555555555555556e-06, |
|
"loss": 1.0209, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 1.3896511172089376, |
|
"grad_norm": 6.428845405578613, |
|
"learning_rate": 4.5462962962962965e-06, |
|
"loss": 1.0065, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 1.391611132889063, |
|
"grad_norm": 5.854006767272949, |
|
"learning_rate": 4.537037037037038e-06, |
|
"loss": 1.0192, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.3935711485691886, |
|
"grad_norm": 5.257528781890869, |
|
"learning_rate": 4.527777777777778e-06, |
|
"loss": 1.0125, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 1.395531164249314, |
|
"grad_norm": 5.832418441772461, |
|
"learning_rate": 4.5185185185185185e-06, |
|
"loss": 1.0077, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 1.3974911799294394, |
|
"grad_norm": 18.90093994140625, |
|
"learning_rate": 4.50925925925926e-06, |
|
"loss": 1.0039, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 1.399451195609565, |
|
"grad_norm": 6.230194568634033, |
|
"learning_rate": 4.5e-06, |
|
"loss": 1.0168, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 1.4014112112896904, |
|
"grad_norm": 5.916494846343994, |
|
"learning_rate": 4.490740740740741e-06, |
|
"loss": 1.0367, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.4033712269698158, |
|
"grad_norm": 6.498172760009766, |
|
"learning_rate": 4.481481481481482e-06, |
|
"loss": 1.0226, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 1.4053312426499411, |
|
"grad_norm": 5.681482791900635, |
|
"learning_rate": 4.472222222222223e-06, |
|
"loss": 1.0135, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 1.4072912583300665, |
|
"grad_norm": 5.3359880447387695, |
|
"learning_rate": 4.462962962962963e-06, |
|
"loss": 1.0222, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 1.4092512740101921, |
|
"grad_norm": 6.866293430328369, |
|
"learning_rate": 4.453703703703704e-06, |
|
"loss": 1.015, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 1.4112112896903175, |
|
"grad_norm": 6.149731159210205, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.998, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.4131713053704429, |
|
"grad_norm": 5.832232475280762, |
|
"learning_rate": 4.435185185185185e-06, |
|
"loss": 1.0097, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 1.4151313210505685, |
|
"grad_norm": 5.8602986335754395, |
|
"learning_rate": 4.425925925925927e-06, |
|
"loss": 1.0133, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 1.4170913367306939, |
|
"grad_norm": 6.716607093811035, |
|
"learning_rate": 4.416666666666667e-06, |
|
"loss": 1.0181, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 1.4190513524108193, |
|
"grad_norm": 5.983255863189697, |
|
"learning_rate": 4.407407407407408e-06, |
|
"loss": 1.0152, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 1.4210113680909449, |
|
"grad_norm": 5.979559898376465, |
|
"learning_rate": 4.398148148148149e-06, |
|
"loss": 1.0187, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.4229713837710702, |
|
"grad_norm": 5.463419437408447, |
|
"learning_rate": 4.388888888888889e-06, |
|
"loss": 1.0018, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 1.4249313994511956, |
|
"grad_norm": 5.58013916015625, |
|
"learning_rate": 4.379629629629629e-06, |
|
"loss": 1.0161, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 1.426891415131321, |
|
"grad_norm": 5.896617412567139, |
|
"learning_rate": 4.370370370370371e-06, |
|
"loss": 1.0234, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 1.4288514308114464, |
|
"grad_norm": 6.59088659286499, |
|
"learning_rate": 4.361111111111112e-06, |
|
"loss": 1.0116, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 1.430811446491572, |
|
"grad_norm": 5.4909348487854, |
|
"learning_rate": 4.351851851851852e-06, |
|
"loss": 1.0241, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.4327714621716974, |
|
"grad_norm": 5.706267356872559, |
|
"learning_rate": 4.342592592592593e-06, |
|
"loss": 1.0113, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 1.4347314778518228, |
|
"grad_norm": 5.893515586853027, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 1.0074, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 1.4366914935319484, |
|
"grad_norm": 5.337756156921387, |
|
"learning_rate": 4.324074074074074e-06, |
|
"loss": 1.0086, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 1.4386515092120737, |
|
"grad_norm": 6.063902854919434, |
|
"learning_rate": 4.314814814814815e-06, |
|
"loss": 1.0135, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 1.4406115248921991, |
|
"grad_norm": 5.862417221069336, |
|
"learning_rate": 4.305555555555556e-06, |
|
"loss": 1.0218, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.4425715405723245, |
|
"grad_norm": 6.047614574432373, |
|
"learning_rate": 4.296296296296296e-06, |
|
"loss": 1.0147, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 1.4445315562524499, |
|
"grad_norm": 7.018215656280518, |
|
"learning_rate": 4.2870370370370376e-06, |
|
"loss": 1.029, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 1.4464915719325755, |
|
"grad_norm": 5.391064167022705, |
|
"learning_rate": 4.277777777777778e-06, |
|
"loss": 1.007, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 1.4484515876127009, |
|
"grad_norm": 5.364874362945557, |
|
"learning_rate": 4.268518518518519e-06, |
|
"loss": 1.0188, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 1.4504116032928263, |
|
"grad_norm": 5.990734577178955, |
|
"learning_rate": 4.2592592592592596e-06, |
|
"loss": 1.0003, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.4523716189729519, |
|
"grad_norm": 5.727319717407227, |
|
"learning_rate": 4.25e-06, |
|
"loss": 1.0198, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 1.4543316346530772, |
|
"grad_norm": 5.658199310302734, |
|
"learning_rate": 4.240740740740741e-06, |
|
"loss": 1.0221, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 1.4562916503332026, |
|
"grad_norm": 5.460375785827637, |
|
"learning_rate": 4.231481481481482e-06, |
|
"loss": 1.0127, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 1.4582516660133282, |
|
"grad_norm": 6.161991596221924, |
|
"learning_rate": 4.222222222222223e-06, |
|
"loss": 1.0025, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 1.4602116816934536, |
|
"grad_norm": 5.7905192375183105, |
|
"learning_rate": 4.212962962962963e-06, |
|
"loss": 1.0071, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.462171697373579, |
|
"grad_norm": 5.652228832244873, |
|
"learning_rate": 4.2037037037037045e-06, |
|
"loss": 1.0112, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 1.4641317130537044, |
|
"grad_norm": 5.817644119262695, |
|
"learning_rate": 4.194444444444445e-06, |
|
"loss": 1.0191, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 1.4660917287338298, |
|
"grad_norm": 6.155873775482178, |
|
"learning_rate": 4.185185185185185e-06, |
|
"loss": 1.013, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 1.4680517444139554, |
|
"grad_norm": 5.775312423706055, |
|
"learning_rate": 4.175925925925926e-06, |
|
"loss": 1.0107, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 1.4700117600940807, |
|
"grad_norm": 5.765095233917236, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 1.0151, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.4719717757742061, |
|
"grad_norm": 5.336995601654053, |
|
"learning_rate": 4.157407407407408e-06, |
|
"loss": 1.0048, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 1.4739317914543317, |
|
"grad_norm": 6.261089324951172, |
|
"learning_rate": 4.1481481481481485e-06, |
|
"loss": 1.0002, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 1.4758918071344571, |
|
"grad_norm": 5.8046746253967285, |
|
"learning_rate": 4.138888888888889e-06, |
|
"loss": 1.0079, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 1.4778518228145825, |
|
"grad_norm": 5.5930867195129395, |
|
"learning_rate": 4.12962962962963e-06, |
|
"loss": 1.0094, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 1.4798118384947079, |
|
"grad_norm": 5.929795742034912, |
|
"learning_rate": 4.1203703703703705e-06, |
|
"loss": 1.0122, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.4817718541748335, |
|
"grad_norm": 5.746526718139648, |
|
"learning_rate": 4.111111111111111e-06, |
|
"loss": 1.0186, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 1.4837318698549589, |
|
"grad_norm": 6.316400051116943, |
|
"learning_rate": 4.101851851851852e-06, |
|
"loss": 1.0019, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 1.4856918855350842, |
|
"grad_norm": 5.561999320983887, |
|
"learning_rate": 4.092592592592593e-06, |
|
"loss": 1.0157, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 1.4876519012152096, |
|
"grad_norm": 6.012383937835693, |
|
"learning_rate": 4.083333333333334e-06, |
|
"loss": 1.0108, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 1.4896119168953352, |
|
"grad_norm": 5.594381809234619, |
|
"learning_rate": 4.074074074074074e-06, |
|
"loss": 0.9917, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.4915719325754606, |
|
"grad_norm": 6.158176898956299, |
|
"learning_rate": 4.064814814814815e-06, |
|
"loss": 1.0063, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 1.493531948255586, |
|
"grad_norm": 5.4881510734558105, |
|
"learning_rate": 4.055555555555556e-06, |
|
"loss": 1.0122, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 1.4954919639357116, |
|
"grad_norm": 6.7185869216918945, |
|
"learning_rate": 4.046296296296296e-06, |
|
"loss": 0.9927, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 1.497451979615837, |
|
"grad_norm": 6.015646934509277, |
|
"learning_rate": 4.037037037037037e-06, |
|
"loss": 1.0054, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 1.4994119952959624, |
|
"grad_norm": 5.855646133422852, |
|
"learning_rate": 4.027777777777779e-06, |
|
"loss": 0.9988, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.501372010976088, |
|
"grad_norm": 6.158780574798584, |
|
"learning_rate": 4.018518518518519e-06, |
|
"loss": 1.027, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 1.5033320266562131, |
|
"grad_norm": 5.943362712860107, |
|
"learning_rate": 4.0092592592592594e-06, |
|
"loss": 1.0285, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 1.5052920423363387, |
|
"grad_norm": 5.775484085083008, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.0093, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 1.5072520580164641, |
|
"grad_norm": 6.551724910736084, |
|
"learning_rate": 3.990740740740741e-06, |
|
"loss": 1.0185, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 1.5092120736965895, |
|
"grad_norm": 5.986613750457764, |
|
"learning_rate": 3.9814814814814814e-06, |
|
"loss": 1.0058, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.5092120736965895, |
|
"eval_loss": 1.0326706171035767, |
|
"eval_runtime": 13.9953, |
|
"eval_samples_per_second": 46.73, |
|
"eval_steps_per_second": 5.859, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.511172089376715, |
|
"grad_norm": 5.968577861785889, |
|
"learning_rate": 3.972222222222223e-06, |
|
"loss": 1.0111, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 1.5131321050568405, |
|
"grad_norm": 5.956306457519531, |
|
"learning_rate": 3.962962962962963e-06, |
|
"loss": 0.9822, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 1.5150921207369659, |
|
"grad_norm": 6.391535758972168, |
|
"learning_rate": 3.953703703703704e-06, |
|
"loss": 1.0073, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 1.5170521364170915, |
|
"grad_norm": 6.031282901763916, |
|
"learning_rate": 3.944444444444445e-06, |
|
"loss": 0.9992, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 1.5190121520972166, |
|
"grad_norm": 6.070298194885254, |
|
"learning_rate": 3.935185185185186e-06, |
|
"loss": 0.9995, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.5209721677773422, |
|
"grad_norm": 6.470989227294922, |
|
"learning_rate": 3.925925925925926e-06, |
|
"loss": 1.0041, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 1.5229321834574676, |
|
"grad_norm": 6.339069843292236, |
|
"learning_rate": 3.916666666666667e-06, |
|
"loss": 1.0145, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 1.524892199137593, |
|
"grad_norm": 6.204465866088867, |
|
"learning_rate": 3.907407407407408e-06, |
|
"loss": 1.0202, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 1.5268522148177186, |
|
"grad_norm": 5.39393424987793, |
|
"learning_rate": 3.898148148148148e-06, |
|
"loss": 1.0144, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 1.528812230497844, |
|
"grad_norm": 6.083856105804443, |
|
"learning_rate": 3.88888888888889e-06, |
|
"loss": 1.0087, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.5307722461779694, |
|
"grad_norm": 6.141456604003906, |
|
"learning_rate": 3.87962962962963e-06, |
|
"loss": 1.0189, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 1.532732261858095, |
|
"grad_norm": 6.509696960449219, |
|
"learning_rate": 3.87037037037037e-06, |
|
"loss": 1.0136, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 1.5346922775382204, |
|
"grad_norm": 5.372650146484375, |
|
"learning_rate": 3.861111111111112e-06, |
|
"loss": 1.0009, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 1.5366522932183457, |
|
"grad_norm": 5.931579113006592, |
|
"learning_rate": 3.851851851851852e-06, |
|
"loss": 1.0144, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 1.5386123088984713, |
|
"grad_norm": 6.2442626953125, |
|
"learning_rate": 3.842592592592592e-06, |
|
"loss": 1.0132, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.5405723245785965, |
|
"grad_norm": 6.475978851318359, |
|
"learning_rate": 3.833333333333334e-06, |
|
"loss": 1.0058, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 1.542532340258722, |
|
"grad_norm": 5.5199103355407715, |
|
"learning_rate": 3.824074074074075e-06, |
|
"loss": 1.0017, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 1.5444923559388475, |
|
"grad_norm": 5.857003211975098, |
|
"learning_rate": 3.814814814814815e-06, |
|
"loss": 1.0108, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 1.5464523716189729, |
|
"grad_norm": 5.81916618347168, |
|
"learning_rate": 3.8055555555555556e-06, |
|
"loss": 1.0185, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 1.5484123872990985, |
|
"grad_norm": 6.099959850311279, |
|
"learning_rate": 3.796296296296297e-06, |
|
"loss": 1.0109, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.5503724029792239, |
|
"grad_norm": 5.837829113006592, |
|
"learning_rate": 3.7870370370370373e-06, |
|
"loss": 1.0004, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 1.5523324186593492, |
|
"grad_norm": 6.274796485900879, |
|
"learning_rate": 3.777777777777778e-06, |
|
"loss": 1.0165, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 1.5542924343394748, |
|
"grad_norm": 5.926800727844238, |
|
"learning_rate": 3.7685185185185185e-06, |
|
"loss": 1.0067, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 1.5562524500196, |
|
"grad_norm": 6.2746381759643555, |
|
"learning_rate": 3.7592592592592597e-06, |
|
"loss": 1.0078, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 1.5582124656997256, |
|
"grad_norm": 6.315319538116455, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 1.0085, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.560172481379851, |
|
"grad_norm": 6.36542272567749, |
|
"learning_rate": 3.740740740740741e-06, |
|
"loss": 1.0073, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 1.5621324970599764, |
|
"grad_norm": 6.047231197357178, |
|
"learning_rate": 3.731481481481482e-06, |
|
"loss": 1.0019, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 1.564092512740102, |
|
"grad_norm": 6.49619722366333, |
|
"learning_rate": 3.7222222222222225e-06, |
|
"loss": 1.0126, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 1.5660525284202274, |
|
"grad_norm": 6.945578098297119, |
|
"learning_rate": 3.7129629629629633e-06, |
|
"loss": 1.0141, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 1.5680125441003527, |
|
"grad_norm": 5.595407485961914, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 1.0077, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.5699725597804783, |
|
"grad_norm": 6.3146867752075195, |
|
"learning_rate": 3.694444444444445e-06, |
|
"loss": 1.0178, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 1.5719325754606037, |
|
"grad_norm": 8.867284774780273, |
|
"learning_rate": 3.6851851851851854e-06, |
|
"loss": 0.9974, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 1.573892591140729, |
|
"grad_norm": 5.997375011444092, |
|
"learning_rate": 3.675925925925926e-06, |
|
"loss": 1.0169, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 1.5758526068208547, |
|
"grad_norm": 6.150747776031494, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 1.0087, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 1.5778126225009799, |
|
"grad_norm": 6.344914436340332, |
|
"learning_rate": 3.657407407407408e-06, |
|
"loss": 1.008, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.5797726381811055, |
|
"grad_norm": 5.457338333129883, |
|
"learning_rate": 3.6481481481481486e-06, |
|
"loss": 1.0169, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 1.5817326538612309, |
|
"grad_norm": 6.128787994384766, |
|
"learning_rate": 3.638888888888889e-06, |
|
"loss": 1.0104, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 1.5836926695413562, |
|
"grad_norm": 6.624429702758789, |
|
"learning_rate": 3.6296296296296302e-06, |
|
"loss": 0.9982, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 1.5856526852214818, |
|
"grad_norm": 5.785494327545166, |
|
"learning_rate": 3.6203703703703706e-06, |
|
"loss": 0.9968, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 1.5876127009016072, |
|
"grad_norm": 6.378090858459473, |
|
"learning_rate": 3.6111111111111115e-06, |
|
"loss": 0.9972, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.5895727165817326, |
|
"grad_norm": 7.523343086242676, |
|
"learning_rate": 3.601851851851852e-06, |
|
"loss": 1.0077, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 1.5915327322618582, |
|
"grad_norm": 5.578686714172363, |
|
"learning_rate": 3.592592592592593e-06, |
|
"loss": 1.0103, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 1.5934927479419836, |
|
"grad_norm": 6.031825065612793, |
|
"learning_rate": 3.5833333333333335e-06, |
|
"loss": 1.015, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 1.595452763622109, |
|
"grad_norm": 6.009352207183838, |
|
"learning_rate": 3.5740740740740743e-06, |
|
"loss": 0.9997, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 1.5974127793022346, |
|
"grad_norm": 6.1863694190979, |
|
"learning_rate": 3.5648148148148147e-06, |
|
"loss": 0.9902, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 1.5993727949823597, |
|
"grad_norm": 5.477410316467285, |
|
"learning_rate": 3.555555555555556e-06, |
|
"loss": 1.0077, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 1.6013328106624853, |
|
"grad_norm": 6.446470260620117, |
|
"learning_rate": 3.5462962962962967e-06, |
|
"loss": 1.0123, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 1.6032928263426107, |
|
"grad_norm": 5.979601860046387, |
|
"learning_rate": 3.537037037037037e-06, |
|
"loss": 1.0106, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 1.605252842022736, |
|
"grad_norm": 5.748382091522217, |
|
"learning_rate": 3.5277777777777784e-06, |
|
"loss": 1.0083, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 1.6072128577028617, |
|
"grad_norm": 6.122110843658447, |
|
"learning_rate": 3.5185185185185187e-06, |
|
"loss": 1.0124, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.609172873382987, |
|
"grad_norm": 6.104898452758789, |
|
"learning_rate": 3.5092592592592596e-06, |
|
"loss": 1.0118, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 1.6111328890631125, |
|
"grad_norm": 6.527801513671875, |
|
"learning_rate": 3.5e-06, |
|
"loss": 1.0084, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 1.613092904743238, |
|
"grad_norm": 6.3016676902771, |
|
"learning_rate": 3.490740740740741e-06, |
|
"loss": 1.0104, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 1.6150529204233632, |
|
"grad_norm": 6.44483757019043, |
|
"learning_rate": 3.481481481481482e-06, |
|
"loss": 1.0102, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 1.6170129361034888, |
|
"grad_norm": 6.052326679229736, |
|
"learning_rate": 3.4722222222222224e-06, |
|
"loss": 1.0147, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 1.6189729517836142, |
|
"grad_norm": 5.94256067276001, |
|
"learning_rate": 3.4629629629629628e-06, |
|
"loss": 1.0108, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 1.6209329674637396, |
|
"grad_norm": 6.634081840515137, |
|
"learning_rate": 3.453703703703704e-06, |
|
"loss": 1.0139, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 1.6228929831438652, |
|
"grad_norm": 6.239657878875732, |
|
"learning_rate": 3.444444444444445e-06, |
|
"loss": 0.9949, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 1.6248529988239906, |
|
"grad_norm": 6.047982692718506, |
|
"learning_rate": 3.4351851851851852e-06, |
|
"loss": 1.0223, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 1.626813014504116, |
|
"grad_norm": 5.618074417114258, |
|
"learning_rate": 3.4259259259259265e-06, |
|
"loss": 1.022, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.6287730301842416, |
|
"grad_norm": 5.856754779815674, |
|
"learning_rate": 3.416666666666667e-06, |
|
"loss": 1.0053, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 1.630733045864367, |
|
"grad_norm": 6.435812473297119, |
|
"learning_rate": 3.4074074074074077e-06, |
|
"loss": 0.9989, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 1.6326930615444923, |
|
"grad_norm": 5.980440139770508, |
|
"learning_rate": 3.398148148148148e-06, |
|
"loss": 1.0108, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 1.634653077224618, |
|
"grad_norm": 6.19628381729126, |
|
"learning_rate": 3.3888888888888893e-06, |
|
"loss": 0.995, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 1.636613092904743, |
|
"grad_norm": 5.549736976623535, |
|
"learning_rate": 3.37962962962963e-06, |
|
"loss": 1.0181, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 1.6385731085848687, |
|
"grad_norm": 6.485236644744873, |
|
"learning_rate": 3.3703703703703705e-06, |
|
"loss": 1.0092, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 1.640533124264994, |
|
"grad_norm": 6.0221848487854, |
|
"learning_rate": 3.3611111111111117e-06, |
|
"loss": 1.0038, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 1.6424931399451195, |
|
"grad_norm": 5.699035167694092, |
|
"learning_rate": 3.351851851851852e-06, |
|
"loss": 1.0092, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 1.644453155625245, |
|
"grad_norm": 6.401322364807129, |
|
"learning_rate": 3.342592592592593e-06, |
|
"loss": 1.0013, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 1.6464131713053705, |
|
"grad_norm": 6.202515125274658, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.0165, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.6464131713053705, |
|
"eval_loss": 1.0264891386032104, |
|
"eval_runtime": 14.05, |
|
"eval_samples_per_second": 46.548, |
|
"eval_steps_per_second": 5.836, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.6483731869854958, |
|
"grad_norm": 6.184670448303223, |
|
"learning_rate": 3.3240740740740746e-06, |
|
"loss": 1.0055, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 1.6503332026656214, |
|
"grad_norm": 6.040178298950195, |
|
"learning_rate": 3.314814814814815e-06, |
|
"loss": 1.007, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 1.6522932183457466, |
|
"grad_norm": 6.925061225891113, |
|
"learning_rate": 3.3055555555555558e-06, |
|
"loss": 1.0101, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 1.6542532340258722, |
|
"grad_norm": 6.050986289978027, |
|
"learning_rate": 3.296296296296296e-06, |
|
"loss": 0.9969, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 1.6562132497059978, |
|
"grad_norm": 5.321831703186035, |
|
"learning_rate": 3.2870370370370374e-06, |
|
"loss": 1.0092, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 1.658173265386123, |
|
"grad_norm": 5.440800666809082, |
|
"learning_rate": 3.277777777777778e-06, |
|
"loss": 1.0115, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 1.6601332810662486, |
|
"grad_norm": 8.311331748962402, |
|
"learning_rate": 3.2685185185185186e-06, |
|
"loss": 1.0014, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 1.662093296746374, |
|
"grad_norm": 6.014708518981934, |
|
"learning_rate": 3.25925925925926e-06, |
|
"loss": 1.0135, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 1.6640533124264993, |
|
"grad_norm": 6.3742475509643555, |
|
"learning_rate": 3.2500000000000002e-06, |
|
"loss": 1.0024, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 1.666013328106625, |
|
"grad_norm": 6.1811041831970215, |
|
"learning_rate": 3.240740740740741e-06, |
|
"loss": 0.9983, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.6679733437867503, |
|
"grad_norm": 6.055778503417969, |
|
"learning_rate": 3.2314814814814814e-06, |
|
"loss": 0.9986, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 1.6699333594668757, |
|
"grad_norm": 6.654355525970459, |
|
"learning_rate": 3.2222222222222227e-06, |
|
"loss": 1.0111, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 1.6718933751470013, |
|
"grad_norm": 6.156193733215332, |
|
"learning_rate": 3.2129629629629635e-06, |
|
"loss": 0.9879, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 1.6738533908271265, |
|
"grad_norm": 6.491064548492432, |
|
"learning_rate": 3.203703703703704e-06, |
|
"loss": 1.0197, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 1.675813406507252, |
|
"grad_norm": 5.876689434051514, |
|
"learning_rate": 3.1944444444444443e-06, |
|
"loss": 0.9934, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 1.6777734221873775, |
|
"grad_norm": 6.113574504852295, |
|
"learning_rate": 3.1851851851851855e-06, |
|
"loss": 0.9992, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 1.6797334378675028, |
|
"grad_norm": 6.2524518966674805, |
|
"learning_rate": 3.1759259259259263e-06, |
|
"loss": 1.0101, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 1.6816934535476284, |
|
"grad_norm": 6.034154891967773, |
|
"learning_rate": 3.1666666666666667e-06, |
|
"loss": 1.0009, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 1.6836534692277538, |
|
"grad_norm": 5.920796871185303, |
|
"learning_rate": 3.157407407407408e-06, |
|
"loss": 1.0088, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 1.6856134849078792, |
|
"grad_norm": 5.578927993774414, |
|
"learning_rate": 3.1481481481481483e-06, |
|
"loss": 1.0004, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.6875735005880048, |
|
"grad_norm": 6.1217803955078125, |
|
"learning_rate": 3.138888888888889e-06, |
|
"loss": 1.0034, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 1.6895335162681302, |
|
"grad_norm": 6.209445953369141, |
|
"learning_rate": 3.1296296296296295e-06, |
|
"loss": 0.9912, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 1.6914935319482556, |
|
"grad_norm": 6.419200897216797, |
|
"learning_rate": 3.1203703703703708e-06, |
|
"loss": 1.0048, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 1.6934535476283812, |
|
"grad_norm": 6.344346046447754, |
|
"learning_rate": 3.1111111111111116e-06, |
|
"loss": 0.9986, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 1.6954135633085063, |
|
"grad_norm": 6.1742963790893555, |
|
"learning_rate": 3.101851851851852e-06, |
|
"loss": 1.0085, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 1.697373578988632, |
|
"grad_norm": 5.269043445587158, |
|
"learning_rate": 3.0925925925925928e-06, |
|
"loss": 1.0045, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 1.6993335946687573, |
|
"grad_norm": 6.707599639892578, |
|
"learning_rate": 3.0833333333333336e-06, |
|
"loss": 1.0074, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 1.7012936103488827, |
|
"grad_norm": 6.476319313049316, |
|
"learning_rate": 3.0740740740740744e-06, |
|
"loss": 1.0066, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 1.7032536260290083, |
|
"grad_norm": 5.808709621429443, |
|
"learning_rate": 3.064814814814815e-06, |
|
"loss": 0.9969, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 1.7052136417091337, |
|
"grad_norm": 10.500265121459961, |
|
"learning_rate": 3.055555555555556e-06, |
|
"loss": 1.0062, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.707173657389259, |
|
"grad_norm": 5.992859840393066, |
|
"learning_rate": 3.0462962962962964e-06, |
|
"loss": 1.0133, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 1.7091336730693847, |
|
"grad_norm": 6.0215325355529785, |
|
"learning_rate": 3.0370370370370372e-06, |
|
"loss": 1.0175, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 1.7110936887495098, |
|
"grad_norm": 6.671189785003662, |
|
"learning_rate": 3.0277777777777776e-06, |
|
"loss": 1.0113, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 1.7130537044296354, |
|
"grad_norm": 6.012123107910156, |
|
"learning_rate": 3.018518518518519e-06, |
|
"loss": 0.9931, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 1.7150137201097608, |
|
"grad_norm": 5.328975200653076, |
|
"learning_rate": 3.0092592592592597e-06, |
|
"loss": 1.0052, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 1.7169737357898862, |
|
"grad_norm": 6.184416770935059, |
|
"learning_rate": 3e-06, |
|
"loss": 1.0125, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 1.7189337514700118, |
|
"grad_norm": 6.222989559173584, |
|
"learning_rate": 2.990740740740741e-06, |
|
"loss": 1.0166, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 1.7208937671501372, |
|
"grad_norm": 6.706179141998291, |
|
"learning_rate": 2.9814814814814817e-06, |
|
"loss": 1.0095, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 1.7228537828302626, |
|
"grad_norm": 6.823479652404785, |
|
"learning_rate": 2.9722222222222225e-06, |
|
"loss": 1.0147, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 1.7248137985103882, |
|
"grad_norm": 7.233177185058594, |
|
"learning_rate": 2.962962962962963e-06, |
|
"loss": 0.9963, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.7267738141905136, |
|
"grad_norm": 5.568224906921387, |
|
"learning_rate": 2.953703703703704e-06, |
|
"loss": 1.0052, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 1.728733829870639, |
|
"grad_norm": 6.352148532867432, |
|
"learning_rate": 2.944444444444445e-06, |
|
"loss": 0.9951, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 1.7306938455507646, |
|
"grad_norm": 6.312885284423828, |
|
"learning_rate": 2.9351851851851853e-06, |
|
"loss": 1.0126, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 1.7326538612308897, |
|
"grad_norm": 6.648190975189209, |
|
"learning_rate": 2.9259259259259257e-06, |
|
"loss": 1.0095, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 1.7346138769110153, |
|
"grad_norm": 7.448049545288086, |
|
"learning_rate": 2.916666666666667e-06, |
|
"loss": 1.0058, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 1.7365738925911407, |
|
"grad_norm": 6.0499653816223145, |
|
"learning_rate": 2.907407407407408e-06, |
|
"loss": 1.002, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 1.738533908271266, |
|
"grad_norm": 6.453138828277588, |
|
"learning_rate": 2.898148148148148e-06, |
|
"loss": 1.0107, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 1.7404939239513917, |
|
"grad_norm": 6.5974555015563965, |
|
"learning_rate": 2.888888888888889e-06, |
|
"loss": 1.0119, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 1.742453939631517, |
|
"grad_norm": 6.174962043762207, |
|
"learning_rate": 2.87962962962963e-06, |
|
"loss": 1.0067, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 1.7444139553116425, |
|
"grad_norm": 6.343291282653809, |
|
"learning_rate": 2.8703703703703706e-06, |
|
"loss": 0.9903, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.746373970991768, |
|
"grad_norm": 6.71571159362793, |
|
"learning_rate": 2.861111111111111e-06, |
|
"loss": 1.018, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 1.7483339866718932, |
|
"grad_norm": 6.948620319366455, |
|
"learning_rate": 2.8518518518518522e-06, |
|
"loss": 1.0138, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 1.7502940023520188, |
|
"grad_norm": 6.206993579864502, |
|
"learning_rate": 2.842592592592593e-06, |
|
"loss": 1.0024, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 1.7522540180321444, |
|
"grad_norm": 6.701568603515625, |
|
"learning_rate": 2.8333333333333335e-06, |
|
"loss": 1.0117, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 1.7542140337122696, |
|
"grad_norm": 6.476100921630859, |
|
"learning_rate": 2.8240740740740743e-06, |
|
"loss": 1.0, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.7561740493923952, |
|
"grad_norm": 7.1540703773498535, |
|
"learning_rate": 2.814814814814815e-06, |
|
"loss": 0.9933, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 1.7581340650725206, |
|
"grad_norm": 6.764838695526123, |
|
"learning_rate": 2.805555555555556e-06, |
|
"loss": 1.0051, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 1.760094080752646, |
|
"grad_norm": 5.835758209228516, |
|
"learning_rate": 2.7962962962962963e-06, |
|
"loss": 1.0058, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 1.7620540964327716, |
|
"grad_norm": 6.640206813812256, |
|
"learning_rate": 2.7870370370370375e-06, |
|
"loss": 1.0012, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 1.764014112112897, |
|
"grad_norm": 7.032010555267334, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 0.9988, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.7659741277930223, |
|
"grad_norm": 6.398713111877441, |
|
"learning_rate": 2.7685185185185187e-06, |
|
"loss": 1.0179, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 1.767934143473148, |
|
"grad_norm": 6.625948905944824, |
|
"learning_rate": 2.759259259259259e-06, |
|
"loss": 1.0089, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 1.769894159153273, |
|
"grad_norm": 5.858684062957764, |
|
"learning_rate": 2.7500000000000004e-06, |
|
"loss": 1.0057, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 1.7718541748333987, |
|
"grad_norm": 7.03538703918457, |
|
"learning_rate": 2.740740740740741e-06, |
|
"loss": 1.0223, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 1.773814190513524, |
|
"grad_norm": 6.332048416137695, |
|
"learning_rate": 2.7314814814814816e-06, |
|
"loss": 1.0027, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 1.7757742061936495, |
|
"grad_norm": 6.304274082183838, |
|
"learning_rate": 2.7222222222222224e-06, |
|
"loss": 0.994, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 1.777734221873775, |
|
"grad_norm": 5.588042736053467, |
|
"learning_rate": 2.712962962962963e-06, |
|
"loss": 1.0044, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 1.7796942375539004, |
|
"grad_norm": 6.531040191650391, |
|
"learning_rate": 2.703703703703704e-06, |
|
"loss": 1.0059, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 1.7816542532340258, |
|
"grad_norm": 6.511843681335449, |
|
"learning_rate": 2.6944444444444444e-06, |
|
"loss": 1.0046, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 1.7836142689141514, |
|
"grad_norm": 6.718422889709473, |
|
"learning_rate": 2.6851851851851856e-06, |
|
"loss": 1.0084, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.7836142689141514, |
|
"eval_loss": 1.0242658853530884, |
|
"eval_runtime": 14.0168, |
|
"eval_samples_per_second": 46.658, |
|
"eval_steps_per_second": 5.85, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.7855742845942768, |
|
"grad_norm": 5.511220455169678, |
|
"learning_rate": 2.6759259259259264e-06, |
|
"loss": 1.0028, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 1.7875343002744022, |
|
"grad_norm": 6.125180244445801, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.9937, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 1.7894943159545278, |
|
"grad_norm": 6.439998626708984, |
|
"learning_rate": 2.6574074074074076e-06, |
|
"loss": 0.9946, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 1.791454331634653, |
|
"grad_norm": 6.341091156005859, |
|
"learning_rate": 2.6481481481481485e-06, |
|
"loss": 1.0131, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 1.7934143473147786, |
|
"grad_norm": 6.909265041351318, |
|
"learning_rate": 2.6388888888888893e-06, |
|
"loss": 1.019, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 1.795374362994904, |
|
"grad_norm": 5.798973083496094, |
|
"learning_rate": 2.6296296296296297e-06, |
|
"loss": 1.0058, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 1.7973343786750293, |
|
"grad_norm": 6.564090728759766, |
|
"learning_rate": 2.6203703703703705e-06, |
|
"loss": 1.0012, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 1.799294394355155, |
|
"grad_norm": 6.725678443908691, |
|
"learning_rate": 2.6111111111111113e-06, |
|
"loss": 0.9987, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 1.8012544100352803, |
|
"grad_norm": 6.482659339904785, |
|
"learning_rate": 2.601851851851852e-06, |
|
"loss": 0.9986, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 1.8032144257154057, |
|
"grad_norm": 10.73702621459961, |
|
"learning_rate": 2.5925925925925925e-06, |
|
"loss": 0.9968, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.8051744413955313, |
|
"grad_norm": 6.893209457397461, |
|
"learning_rate": 2.5833333333333337e-06, |
|
"loss": 1.0111, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 1.8071344570756565, |
|
"grad_norm": 6.212709426879883, |
|
"learning_rate": 2.5740740740740745e-06, |
|
"loss": 1.0036, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 1.809094472755782, |
|
"grad_norm": 7.134426116943359, |
|
"learning_rate": 2.564814814814815e-06, |
|
"loss": 1.0008, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 1.8110544884359074, |
|
"grad_norm": 12.403793334960938, |
|
"learning_rate": 2.5555555555555557e-06, |
|
"loss": 0.9933, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 1.8130145041160328, |
|
"grad_norm": 6.31683349609375, |
|
"learning_rate": 2.5462962962962966e-06, |
|
"loss": 1.0083, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 1.8149745197961584, |
|
"grad_norm": 5.857621669769287, |
|
"learning_rate": 2.5370370370370374e-06, |
|
"loss": 0.993, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 1.8169345354762838, |
|
"grad_norm": 5.9981584548950195, |
|
"learning_rate": 2.5277777777777778e-06, |
|
"loss": 1.008, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 1.8188945511564092, |
|
"grad_norm": 6.6797404289245605, |
|
"learning_rate": 2.5185185185185186e-06, |
|
"loss": 0.9915, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 1.8208545668365348, |
|
"grad_norm": 6.050951957702637, |
|
"learning_rate": 2.50925925925926e-06, |
|
"loss": 1.0248, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 1.8228145825166602, |
|
"grad_norm": 6.692429542541504, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.0103, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.8247745981967856, |
|
"grad_norm": 6.396121978759766, |
|
"learning_rate": 2.490740740740741e-06, |
|
"loss": 1.0026, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 1.8267346138769112, |
|
"grad_norm": 6.65963077545166, |
|
"learning_rate": 2.481481481481482e-06, |
|
"loss": 0.9912, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 1.8286946295570363, |
|
"grad_norm": 7.833820343017578, |
|
"learning_rate": 2.4722222222222226e-06, |
|
"loss": 0.994, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 1.830654645237162, |
|
"grad_norm": 5.663256645202637, |
|
"learning_rate": 2.462962962962963e-06, |
|
"loss": 1.0014, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 1.8326146609172873, |
|
"grad_norm": 6.487533092498779, |
|
"learning_rate": 2.453703703703704e-06, |
|
"loss": 1.0022, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 1.8345746765974127, |
|
"grad_norm": 6.143520832061768, |
|
"learning_rate": 2.4444444444444447e-06, |
|
"loss": 0.9852, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 1.8365346922775383, |
|
"grad_norm": 6.246576309204102, |
|
"learning_rate": 2.4351851851851855e-06, |
|
"loss": 0.9995, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 1.8384947079576637, |
|
"grad_norm": 6.709537982940674, |
|
"learning_rate": 2.425925925925926e-06, |
|
"loss": 1.0039, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 1.840454723637789, |
|
"grad_norm": 6.031428813934326, |
|
"learning_rate": 2.4166666666666667e-06, |
|
"loss": 1.0103, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 1.8424147393179147, |
|
"grad_norm": 6.703726291656494, |
|
"learning_rate": 2.4074074074074075e-06, |
|
"loss": 0.9999, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.8443747549980398, |
|
"grad_norm": 6.8644819259643555, |
|
"learning_rate": 2.3981481481481483e-06, |
|
"loss": 0.9994, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 1.8463347706781654, |
|
"grad_norm": 7.306818962097168, |
|
"learning_rate": 2.388888888888889e-06, |
|
"loss": 0.9995, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 1.848294786358291, |
|
"grad_norm": 5.757288932800293, |
|
"learning_rate": 2.37962962962963e-06, |
|
"loss": 0.9883, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 1.8502548020384162, |
|
"grad_norm": 6.339224815368652, |
|
"learning_rate": 2.3703703703703707e-06, |
|
"loss": 0.9953, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 1.8522148177185418, |
|
"grad_norm": 8.186257362365723, |
|
"learning_rate": 2.361111111111111e-06, |
|
"loss": 0.9999, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 1.8541748333986672, |
|
"grad_norm": 6.2635111808776855, |
|
"learning_rate": 2.351851851851852e-06, |
|
"loss": 1.0047, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 1.8561348490787926, |
|
"grad_norm": 6.483547210693359, |
|
"learning_rate": 2.3425925925925928e-06, |
|
"loss": 1.0003, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 1.8580948647589182, |
|
"grad_norm": 6.6517333984375, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 1.0152, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 1.8600548804390435, |
|
"grad_norm": 6.463938236236572, |
|
"learning_rate": 2.324074074074074e-06, |
|
"loss": 0.9917, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 1.862014896119169, |
|
"grad_norm": 6.184695243835449, |
|
"learning_rate": 2.314814814814815e-06, |
|
"loss": 0.9889, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.8639749117992945, |
|
"grad_norm": 6.435100078582764, |
|
"learning_rate": 2.305555555555556e-06, |
|
"loss": 1.002, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 1.8659349274794197, |
|
"grad_norm": 5.961505889892578, |
|
"learning_rate": 2.2962962962962964e-06, |
|
"loss": 0.9931, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 1.8678949431595453, |
|
"grad_norm": 6.590498924255371, |
|
"learning_rate": 2.2870370370370372e-06, |
|
"loss": 0.9967, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 1.8698549588396707, |
|
"grad_norm": 6.019999027252197, |
|
"learning_rate": 2.277777777777778e-06, |
|
"loss": 0.9887, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 1.871814974519796, |
|
"grad_norm": 6.332427978515625, |
|
"learning_rate": 2.268518518518519e-06, |
|
"loss": 1.0014, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 1.8737749901999217, |
|
"grad_norm": 6.301807880401611, |
|
"learning_rate": 2.2592592592592592e-06, |
|
"loss": 0.9969, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 1.875735005880047, |
|
"grad_norm": 6.279837131500244, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.9944, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 1.8776950215601724, |
|
"grad_norm": 7.422451496124268, |
|
"learning_rate": 2.240740740740741e-06, |
|
"loss": 1.0007, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 1.879655037240298, |
|
"grad_norm": 6.722255229949951, |
|
"learning_rate": 2.2314814814814817e-06, |
|
"loss": 0.9985, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 1.8816150529204234, |
|
"grad_norm": 6.471696853637695, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.9984, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.8835750686005488, |
|
"grad_norm": 6.347057819366455, |
|
"learning_rate": 2.2129629629629633e-06, |
|
"loss": 1.0092, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 1.8855350842806744, |
|
"grad_norm": 6.340747833251953, |
|
"learning_rate": 2.203703703703704e-06, |
|
"loss": 0.9978, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 1.8874950999607996, |
|
"grad_norm": 5.975046157836914, |
|
"learning_rate": 2.1944444444444445e-06, |
|
"loss": 1.0006, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 1.8894551156409252, |
|
"grad_norm": 6.538125038146973, |
|
"learning_rate": 2.1851851851851853e-06, |
|
"loss": 1.006, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 1.8914151313210505, |
|
"grad_norm": 6.345036506652832, |
|
"learning_rate": 2.175925925925926e-06, |
|
"loss": 1.0106, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 1.893375147001176, |
|
"grad_norm": 6.604660987854004, |
|
"learning_rate": 2.166666666666667e-06, |
|
"loss": 0.9964, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 1.8953351626813015, |
|
"grad_norm": 6.32635498046875, |
|
"learning_rate": 2.1574074074074073e-06, |
|
"loss": 1.0108, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 1.897295178361427, |
|
"grad_norm": 6.804265975952148, |
|
"learning_rate": 2.148148148148148e-06, |
|
"loss": 1.0053, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 1.8992551940415523, |
|
"grad_norm": 6.880146503448486, |
|
"learning_rate": 2.138888888888889e-06, |
|
"loss": 1.0073, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 1.901215209721678, |
|
"grad_norm": 7.2419538497924805, |
|
"learning_rate": 2.1296296296296298e-06, |
|
"loss": 1.004, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.903175225401803, |
|
"grad_norm": 6.319647789001465, |
|
"learning_rate": 2.1203703703703706e-06, |
|
"loss": 1.0049, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 1.9051352410819287, |
|
"grad_norm": 6.6896491050720215, |
|
"learning_rate": 2.1111111111111114e-06, |
|
"loss": 1.0007, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 1.907095256762054, |
|
"grad_norm": 5.643420696258545, |
|
"learning_rate": 2.1018518518518522e-06, |
|
"loss": 0.9898, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 1.9090552724421794, |
|
"grad_norm": 6.660647869110107, |
|
"learning_rate": 2.0925925925925926e-06, |
|
"loss": 1.0039, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 1.911015288122305, |
|
"grad_norm": 7.0795464515686035, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.986, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 1.9129753038024304, |
|
"grad_norm": 6.72257137298584, |
|
"learning_rate": 2.0740740740740742e-06, |
|
"loss": 1.0045, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 1.9149353194825558, |
|
"grad_norm": 7.132964134216309, |
|
"learning_rate": 2.064814814814815e-06, |
|
"loss": 1.0043, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 1.9168953351626814, |
|
"grad_norm": 6.032354354858398, |
|
"learning_rate": 2.0555555555555555e-06, |
|
"loss": 1.0159, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 1.9188553508428068, |
|
"grad_norm": 6.651278495788574, |
|
"learning_rate": 2.0462962962962967e-06, |
|
"loss": 1.0009, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 1.9208153665229322, |
|
"grad_norm": 6.432239532470703, |
|
"learning_rate": 2.037037037037037e-06, |
|
"loss": 1.0035, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.9208153665229322, |
|
"eval_loss": 1.0201009511947632, |
|
"eval_runtime": 14.0626, |
|
"eval_samples_per_second": 46.506, |
|
"eval_steps_per_second": 5.831, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.9227753822030578, |
|
"grad_norm": 6.820202827453613, |
|
"learning_rate": 2.027777777777778e-06, |
|
"loss": 0.9879, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 1.924735397883183, |
|
"grad_norm": 6.324273586273193, |
|
"learning_rate": 2.0185185185185187e-06, |
|
"loss": 1.0022, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 1.9266954135633085, |
|
"grad_norm": 5.975357532501221, |
|
"learning_rate": 2.0092592592592595e-06, |
|
"loss": 1.0168, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 1.928655429243434, |
|
"grad_norm": 7.312973976135254, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.0017, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 1.9306154449235593, |
|
"grad_norm": 7.051765441894531, |
|
"learning_rate": 1.9907407407407407e-06, |
|
"loss": 0.992, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 1.932575460603685, |
|
"grad_norm": 6.269126892089844, |
|
"learning_rate": 1.9814814814814815e-06, |
|
"loss": 1.0018, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 1.9345354762838103, |
|
"grad_norm": 6.204473972320557, |
|
"learning_rate": 1.9722222222222224e-06, |
|
"loss": 0.999, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 1.9364954919639357, |
|
"grad_norm": 6.758467197418213, |
|
"learning_rate": 1.962962962962963e-06, |
|
"loss": 1.0023, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 1.9384555076440613, |
|
"grad_norm": 6.538867950439453, |
|
"learning_rate": 1.953703703703704e-06, |
|
"loss": 0.9907, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 1.9404155233241864, |
|
"grad_norm": 7.003431797027588, |
|
"learning_rate": 1.944444444444445e-06, |
|
"loss": 1.0047, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.942375539004312, |
|
"grad_norm": 6.028417587280273, |
|
"learning_rate": 1.935185185185185e-06, |
|
"loss": 1.0027, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 1.9443355546844376, |
|
"grad_norm": 6.356867790222168, |
|
"learning_rate": 1.925925925925926e-06, |
|
"loss": 0.9902, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 1.9462955703645628, |
|
"grad_norm": 7.202489376068115, |
|
"learning_rate": 1.916666666666667e-06, |
|
"loss": 1.0106, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 1.9482555860446884, |
|
"grad_norm": 6.344156265258789, |
|
"learning_rate": 1.9074074074074076e-06, |
|
"loss": 0.9916, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 1.9502156017248138, |
|
"grad_norm": 6.817245006561279, |
|
"learning_rate": 1.8981481481481484e-06, |
|
"loss": 0.9829, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 1.9521756174049392, |
|
"grad_norm": 6.988001823425293, |
|
"learning_rate": 1.888888888888889e-06, |
|
"loss": 0.9997, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 1.9541356330850648, |
|
"grad_norm": 6.65484619140625, |
|
"learning_rate": 1.8796296296296299e-06, |
|
"loss": 1.0026, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 1.9560956487651902, |
|
"grad_norm": 6.717133045196533, |
|
"learning_rate": 1.8703703703703705e-06, |
|
"loss": 1.0071, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 1.9580556644453155, |
|
"grad_norm": 6.037536144256592, |
|
"learning_rate": 1.8611111111111113e-06, |
|
"loss": 0.993, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 1.9600156801254411, |
|
"grad_norm": 6.965274810791016, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.9918, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.9619756958055663, |
|
"grad_norm": 6.768052101135254, |
|
"learning_rate": 1.8425925925925927e-06, |
|
"loss": 0.9873, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 1.963935711485692, |
|
"grad_norm": 6.111681938171387, |
|
"learning_rate": 1.8333333333333333e-06, |
|
"loss": 1.0111, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 1.9658957271658173, |
|
"grad_norm": 6.606689453125, |
|
"learning_rate": 1.8240740740740743e-06, |
|
"loss": 0.9997, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 1.9678557428459427, |
|
"grad_norm": 6.256832599639893, |
|
"learning_rate": 1.8148148148148151e-06, |
|
"loss": 0.9977, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 1.9698157585260683, |
|
"grad_norm": 6.401496410369873, |
|
"learning_rate": 1.8055555555555557e-06, |
|
"loss": 1.0069, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 1.9717757742061937, |
|
"grad_norm": 6.166873455047607, |
|
"learning_rate": 1.7962962962962965e-06, |
|
"loss": 0.9998, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 1.973735789886319, |
|
"grad_norm": 5.77340030670166, |
|
"learning_rate": 1.7870370370370371e-06, |
|
"loss": 1.0008, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 1.9756958055664446, |
|
"grad_norm": 7.6040120124816895, |
|
"learning_rate": 1.777777777777778e-06, |
|
"loss": 0.9991, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 1.97765582124657, |
|
"grad_norm": 6.114837169647217, |
|
"learning_rate": 1.7685185185185186e-06, |
|
"loss": 0.9949, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 1.9796158369266954, |
|
"grad_norm": 6.565464496612549, |
|
"learning_rate": 1.7592592592592594e-06, |
|
"loss": 1.0141, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.981575852606821, |
|
"grad_norm": 6.436923980712891, |
|
"learning_rate": 1.75e-06, |
|
"loss": 0.9975, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 1.9835358682869462, |
|
"grad_norm": 7.330633640289307, |
|
"learning_rate": 1.740740740740741e-06, |
|
"loss": 1.0014, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 1.9854958839670718, |
|
"grad_norm": 7.172529697418213, |
|
"learning_rate": 1.7314814814814814e-06, |
|
"loss": 0.9811, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 1.9874558996471972, |
|
"grad_norm": 6.164400577545166, |
|
"learning_rate": 1.7222222222222224e-06, |
|
"loss": 0.9972, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 1.9894159153273225, |
|
"grad_norm": 6.392111778259277, |
|
"learning_rate": 1.7129629629629632e-06, |
|
"loss": 1.0001, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 1.9913759310074481, |
|
"grad_norm": 8.478386878967285, |
|
"learning_rate": 1.7037037037037038e-06, |
|
"loss": 1.0015, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 1.9933359466875735, |
|
"grad_norm": 6.89587926864624, |
|
"learning_rate": 1.6944444444444446e-06, |
|
"loss": 1.0047, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 1.995295962367699, |
|
"grad_norm": 6.686724662780762, |
|
"learning_rate": 1.6851851851851852e-06, |
|
"loss": 0.9953, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 1.9972559780478245, |
|
"grad_norm": 6.857761859893799, |
|
"learning_rate": 1.675925925925926e-06, |
|
"loss": 0.9836, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 1.9992159937279497, |
|
"grad_norm": 6.607189178466797, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.9979, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.0011760094080753, |
|
"grad_norm": 6.168262481689453, |
|
"learning_rate": 1.6574074074074075e-06, |
|
"loss": 0.9719, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 2.003136025088201, |
|
"grad_norm": 6.525495529174805, |
|
"learning_rate": 1.648148148148148e-06, |
|
"loss": 0.9789, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 2.005096040768326, |
|
"grad_norm": 5.562708854675293, |
|
"learning_rate": 1.638888888888889e-06, |
|
"loss": 0.98, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 2.0070560564484516, |
|
"grad_norm": 7.023379802703857, |
|
"learning_rate": 1.62962962962963e-06, |
|
"loss": 0.985, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 2.0090160721285772, |
|
"grad_norm": 7.55203104019165, |
|
"learning_rate": 1.6203703703703705e-06, |
|
"loss": 0.9758, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 2.0109760878087024, |
|
"grad_norm": 13.453210830688477, |
|
"learning_rate": 1.6111111111111113e-06, |
|
"loss": 0.9656, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 2.012936103488828, |
|
"grad_norm": 6.530963897705078, |
|
"learning_rate": 1.601851851851852e-06, |
|
"loss": 0.9615, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 2.014896119168953, |
|
"grad_norm": 6.9765520095825195, |
|
"learning_rate": 1.5925925925925927e-06, |
|
"loss": 0.9786, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 2.0168561348490788, |
|
"grad_norm": 7.124659538269043, |
|
"learning_rate": 1.5833333333333333e-06, |
|
"loss": 0.9795, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 2.0188161505292044, |
|
"grad_norm": 7.119600772857666, |
|
"learning_rate": 1.5740740740740742e-06, |
|
"loss": 0.9672, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.0207761662093295, |
|
"grad_norm": 6.711264133453369, |
|
"learning_rate": 1.5648148148148148e-06, |
|
"loss": 0.965, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 2.022736181889455, |
|
"grad_norm": 7.027464389801025, |
|
"learning_rate": 1.5555555555555558e-06, |
|
"loss": 0.97, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 2.0246961975695807, |
|
"grad_norm": 7.815002918243408, |
|
"learning_rate": 1.5462962962962964e-06, |
|
"loss": 0.9706, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 2.026656213249706, |
|
"grad_norm": 6.989712238311768, |
|
"learning_rate": 1.5370370370370372e-06, |
|
"loss": 0.9741, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 2.0286162289298315, |
|
"grad_norm": 6.903376579284668, |
|
"learning_rate": 1.527777777777778e-06, |
|
"loss": 0.9707, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 2.0305762446099567, |
|
"grad_norm": 5.8991570472717285, |
|
"learning_rate": 1.5185185185185186e-06, |
|
"loss": 0.9731, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 2.0325362602900823, |
|
"grad_norm": 7.208601474761963, |
|
"learning_rate": 1.5092592592592594e-06, |
|
"loss": 0.9845, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 2.034496275970208, |
|
"grad_norm": 6.827620506286621, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.968, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 2.036456291650333, |
|
"grad_norm": 6.6345295906066895, |
|
"learning_rate": 1.4907407407407409e-06, |
|
"loss": 0.9626, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 2.0384163073304586, |
|
"grad_norm": 7.7344865798950195, |
|
"learning_rate": 1.4814814814814815e-06, |
|
"loss": 0.9717, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.0403763230105842, |
|
"grad_norm": 7.092048168182373, |
|
"learning_rate": 1.4722222222222225e-06, |
|
"loss": 0.9721, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 2.0423363386907094, |
|
"grad_norm": 6.746728897094727, |
|
"learning_rate": 1.4629629629629629e-06, |
|
"loss": 0.9755, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 2.044296354370835, |
|
"grad_norm": 7.295462131500244, |
|
"learning_rate": 1.453703703703704e-06, |
|
"loss": 0.9633, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 2.0462563700509606, |
|
"grad_norm": 6.882177829742432, |
|
"learning_rate": 1.4444444444444445e-06, |
|
"loss": 0.9668, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 2.0482163857310858, |
|
"grad_norm": 6.1587300300598145, |
|
"learning_rate": 1.4351851851851853e-06, |
|
"loss": 0.9773, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 2.0501764014112114, |
|
"grad_norm": 7.219092845916748, |
|
"learning_rate": 1.4259259259259261e-06, |
|
"loss": 0.9724, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 2.0521364170913365, |
|
"grad_norm": 6.330183029174805, |
|
"learning_rate": 1.4166666666666667e-06, |
|
"loss": 0.965, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 2.054096432771462, |
|
"grad_norm": 7.83421516418457, |
|
"learning_rate": 1.4074074074074075e-06, |
|
"loss": 0.9715, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 2.0560564484515877, |
|
"grad_norm": 6.530287265777588, |
|
"learning_rate": 1.3981481481481481e-06, |
|
"loss": 0.9859, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 2.058016464131713, |
|
"grad_norm": 7.4246625900268555, |
|
"learning_rate": 1.3888888888888892e-06, |
|
"loss": 0.9567, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.058016464131713, |
|
"eval_loss": 1.0192580223083496, |
|
"eval_runtime": 14.0156, |
|
"eval_samples_per_second": 46.662, |
|
"eval_steps_per_second": 5.851, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.0599764798118385, |
|
"grad_norm": 7.052817344665527, |
|
"learning_rate": 1.3796296296296296e-06, |
|
"loss": 0.9657, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 2.061936495491964, |
|
"grad_norm": 7.35888147354126, |
|
"learning_rate": 1.3703703703703706e-06, |
|
"loss": 0.9702, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 2.0638965111720893, |
|
"grad_norm": 6.373018264770508, |
|
"learning_rate": 1.3611111111111112e-06, |
|
"loss": 0.9753, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 2.065856526852215, |
|
"grad_norm": 6.720044136047363, |
|
"learning_rate": 1.351851851851852e-06, |
|
"loss": 0.9708, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 2.0678165425323405, |
|
"grad_norm": 6.653097152709961, |
|
"learning_rate": 1.3425925925925928e-06, |
|
"loss": 0.9702, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 2.0697765582124656, |
|
"grad_norm": 7.028474807739258, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.9787, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 2.0717365738925912, |
|
"grad_norm": 7.457616329193115, |
|
"learning_rate": 1.3240740740740742e-06, |
|
"loss": 0.9585, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 2.0736965895727164, |
|
"grad_norm": 6.340860843658447, |
|
"learning_rate": 1.3148148148148148e-06, |
|
"loss": 0.9592, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 2.075656605252842, |
|
"grad_norm": 7.086592674255371, |
|
"learning_rate": 1.3055555555555556e-06, |
|
"loss": 0.9701, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 2.0776166209329676, |
|
"grad_norm": 7.2209320068359375, |
|
"learning_rate": 1.2962962962962962e-06, |
|
"loss": 0.9655, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 2.0795766366130928, |
|
"grad_norm": 6.678873062133789, |
|
"learning_rate": 1.2870370370370373e-06, |
|
"loss": 0.9674, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 2.0815366522932184, |
|
"grad_norm": 7.573465347290039, |
|
"learning_rate": 1.2777777777777779e-06, |
|
"loss": 0.9754, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 2.083496667973344, |
|
"grad_norm": 7.289187908172607, |
|
"learning_rate": 1.2685185185185187e-06, |
|
"loss": 0.9769, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 2.085456683653469, |
|
"grad_norm": 6.930860996246338, |
|
"learning_rate": 1.2592592592592593e-06, |
|
"loss": 0.9697, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 2.0874166993335947, |
|
"grad_norm": 6.5301618576049805, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.9766, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 2.08937671501372, |
|
"grad_norm": 8.65379524230957, |
|
"learning_rate": 1.240740740740741e-06, |
|
"loss": 0.9545, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 2.0913367306938455, |
|
"grad_norm": 6.923322677612305, |
|
"learning_rate": 1.2314814814814815e-06, |
|
"loss": 0.9643, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 2.093296746373971, |
|
"grad_norm": 7.2696614265441895, |
|
"learning_rate": 1.2222222222222223e-06, |
|
"loss": 0.9677, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 2.0952567620540963, |
|
"grad_norm": 7.1026835441589355, |
|
"learning_rate": 1.212962962962963e-06, |
|
"loss": 0.9791, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 2.097216777734222, |
|
"grad_norm": 6.935880661010742, |
|
"learning_rate": 1.2037037037037037e-06, |
|
"loss": 0.9639, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 2.0991767934143475, |
|
"grad_norm": 6.7199201583862305, |
|
"learning_rate": 1.1944444444444446e-06, |
|
"loss": 0.9795, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 2.1011368090944726, |
|
"grad_norm": 7.126111030578613, |
|
"learning_rate": 1.1851851851851854e-06, |
|
"loss": 0.9604, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 2.1030968247745983, |
|
"grad_norm": 7.274319171905518, |
|
"learning_rate": 1.175925925925926e-06, |
|
"loss": 0.9696, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 2.105056840454724, |
|
"grad_norm": 7.07975959777832, |
|
"learning_rate": 1.1666666666666668e-06, |
|
"loss": 0.9708, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 2.107016856134849, |
|
"grad_norm": 24.05130386352539, |
|
"learning_rate": 1.1574074074074076e-06, |
|
"loss": 0.9765, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 2.1089768718149746, |
|
"grad_norm": 7.428647518157959, |
|
"learning_rate": 1.1481481481481482e-06, |
|
"loss": 0.9665, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 2.1109368874951, |
|
"grad_norm": 6.800421714782715, |
|
"learning_rate": 1.138888888888889e-06, |
|
"loss": 0.9687, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 2.1128969031752254, |
|
"grad_norm": 8.014981269836426, |
|
"learning_rate": 1.1296296296296296e-06, |
|
"loss": 0.9773, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 2.114856918855351, |
|
"grad_norm": 7.821423053741455, |
|
"learning_rate": 1.1203703703703704e-06, |
|
"loss": 0.9694, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 2.116816934535476, |
|
"grad_norm": 8.069396018981934, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.9741, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 2.1187769502156018, |
|
"grad_norm": 7.268844127655029, |
|
"learning_rate": 1.101851851851852e-06, |
|
"loss": 0.9685, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 2.1207369658957274, |
|
"grad_norm": 7.122548580169678, |
|
"learning_rate": 1.0925925925925927e-06, |
|
"loss": 0.9491, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 2.1226969815758525, |
|
"grad_norm": 6.834280014038086, |
|
"learning_rate": 1.0833333333333335e-06, |
|
"loss": 0.9836, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 2.124656997255978, |
|
"grad_norm": 6.231177806854248, |
|
"learning_rate": 1.074074074074074e-06, |
|
"loss": 0.9729, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 2.1266170129361033, |
|
"grad_norm": 7.387545585632324, |
|
"learning_rate": 1.0648148148148149e-06, |
|
"loss": 0.9658, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 2.128577028616229, |
|
"grad_norm": 8.538785934448242, |
|
"learning_rate": 1.0555555555555557e-06, |
|
"loss": 0.9521, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 2.1305370442963545, |
|
"grad_norm": 7.329588890075684, |
|
"learning_rate": 1.0462962962962963e-06, |
|
"loss": 0.9804, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 2.1324970599764796, |
|
"grad_norm": 6.391878128051758, |
|
"learning_rate": 1.0370370370370371e-06, |
|
"loss": 0.9684, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 2.1344570756566053, |
|
"grad_norm": 7.090411186218262, |
|
"learning_rate": 1.0277777777777777e-06, |
|
"loss": 0.9622, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 2.136417091336731, |
|
"grad_norm": 6.1176958084106445, |
|
"learning_rate": 1.0185185185185185e-06, |
|
"loss": 0.9696, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 2.138377107016856, |
|
"grad_norm": 10.32075023651123, |
|
"learning_rate": 1.0092592592592594e-06, |
|
"loss": 0.9686, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 2.1403371226969816, |
|
"grad_norm": 7.174540996551514, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.9647, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 2.1422971383771072, |
|
"grad_norm": 7.739523410797119, |
|
"learning_rate": 9.907407407407408e-07, |
|
"loss": 0.9646, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 2.1442571540572324, |
|
"grad_norm": 7.084393501281738, |
|
"learning_rate": 9.814814814814816e-07, |
|
"loss": 0.9695, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 2.146217169737358, |
|
"grad_norm": 7.240608215332031, |
|
"learning_rate": 9.722222222222224e-07, |
|
"loss": 0.9655, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 2.148177185417483, |
|
"grad_norm": 7.608102798461914, |
|
"learning_rate": 9.62962962962963e-07, |
|
"loss": 0.9598, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 2.1501372010976088, |
|
"grad_norm": 7.160534858703613, |
|
"learning_rate": 9.537037037037038e-07, |
|
"loss": 0.9649, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 2.1520972167777344, |
|
"grad_norm": 7.145265102386475, |
|
"learning_rate": 9.444444444444445e-07, |
|
"loss": 0.9646, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 2.1540572324578595, |
|
"grad_norm": 25.257863998413086, |
|
"learning_rate": 9.351851851851852e-07, |
|
"loss": 0.9529, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 2.156017248137985, |
|
"grad_norm": 7.226001262664795, |
|
"learning_rate": 9.259259259259259e-07, |
|
"loss": 0.9556, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.1579772638181107, |
|
"grad_norm": 8.324951171875, |
|
"learning_rate": 9.166666666666666e-07, |
|
"loss": 0.9666, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 2.159937279498236, |
|
"grad_norm": 7.676420211791992, |
|
"learning_rate": 9.074074074074076e-07, |
|
"loss": 0.9822, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 2.1618972951783615, |
|
"grad_norm": 8.159540176391602, |
|
"learning_rate": 8.981481481481483e-07, |
|
"loss": 0.9608, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 2.1638573108584866, |
|
"grad_norm": 7.420976161956787, |
|
"learning_rate": 8.88888888888889e-07, |
|
"loss": 0.974, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 2.1658173265386123, |
|
"grad_norm": 7.221758842468262, |
|
"learning_rate": 8.796296296296297e-07, |
|
"loss": 0.9679, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 2.167777342218738, |
|
"grad_norm": 7.399634838104248, |
|
"learning_rate": 8.703703703703705e-07, |
|
"loss": 0.9699, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 2.169737357898863, |
|
"grad_norm": 6.9496026039123535, |
|
"learning_rate": 8.611111111111112e-07, |
|
"loss": 0.963, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 2.1716973735789886, |
|
"grad_norm": 7.964886665344238, |
|
"learning_rate": 8.518518518518519e-07, |
|
"loss": 0.9649, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 2.1736573892591142, |
|
"grad_norm": 6.945036888122559, |
|
"learning_rate": 8.425925925925926e-07, |
|
"loss": 0.9658, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 2.1756174049392394, |
|
"grad_norm": 7.315770149230957, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.9715, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 2.177577420619365, |
|
"grad_norm": 7.41797399520874, |
|
"learning_rate": 8.24074074074074e-07, |
|
"loss": 0.9579, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 2.1795374362994906, |
|
"grad_norm": 6.9420366287231445, |
|
"learning_rate": 8.14814814814815e-07, |
|
"loss": 0.9785, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 2.1814974519796158, |
|
"grad_norm": 6.257632255554199, |
|
"learning_rate": 8.055555555555557e-07, |
|
"loss": 0.9673, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 2.1834574676597414, |
|
"grad_norm": 7.180861949920654, |
|
"learning_rate": 7.962962962962964e-07, |
|
"loss": 0.9691, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 2.1854174833398665, |
|
"grad_norm": 7.286172389984131, |
|
"learning_rate": 7.870370370370371e-07, |
|
"loss": 0.9583, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 2.187377499019992, |
|
"grad_norm": 7.305385589599609, |
|
"learning_rate": 7.777777777777779e-07, |
|
"loss": 0.9632, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 2.1893375147001177, |
|
"grad_norm": 7.4768290519714355, |
|
"learning_rate": 7.685185185185186e-07, |
|
"loss": 0.9777, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 2.191297530380243, |
|
"grad_norm": 7.44431209564209, |
|
"learning_rate": 7.592592592592593e-07, |
|
"loss": 0.9748, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 2.1932575460603685, |
|
"grad_norm": 7.362459182739258, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.97, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 2.195217561740494, |
|
"grad_norm": 6.118969917297363, |
|
"learning_rate": 7.407407407407407e-07, |
|
"loss": 0.967, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 2.195217561740494, |
|
"eval_loss": 1.015982985496521, |
|
"eval_runtime": 14.0402, |
|
"eval_samples_per_second": 46.58, |
|
"eval_steps_per_second": 5.84, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 2.1971775774206193, |
|
"grad_norm": 7.092718601226807, |
|
"learning_rate": 7.314814814814814e-07, |
|
"loss": 0.9661, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 2.199137593100745, |
|
"grad_norm": 7.384366512298584, |
|
"learning_rate": 7.222222222222222e-07, |
|
"loss": 0.9669, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 2.20109760878087, |
|
"grad_norm": 7.769095420837402, |
|
"learning_rate": 7.129629629629631e-07, |
|
"loss": 0.9674, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 2.2030576244609956, |
|
"grad_norm": 6.9204864501953125, |
|
"learning_rate": 7.037037037037038e-07, |
|
"loss": 0.9661, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 2.2050176401411212, |
|
"grad_norm": 6.869389057159424, |
|
"learning_rate": 6.944444444444446e-07, |
|
"loss": 0.9619, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 2.2069776558212464, |
|
"grad_norm": 7.39153528213501, |
|
"learning_rate": 6.851851851851853e-07, |
|
"loss": 0.9605, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 2.208937671501372, |
|
"grad_norm": 6.61447811126709, |
|
"learning_rate": 6.75925925925926e-07, |
|
"loss": 0.9796, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 2.2108976871814976, |
|
"grad_norm": 6.7030863761901855, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.96, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 2.2128577028616228, |
|
"grad_norm": 7.7893829345703125, |
|
"learning_rate": 6.574074074074074e-07, |
|
"loss": 0.9571, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 2.2148177185417484, |
|
"grad_norm": 7.597008228302002, |
|
"learning_rate": 6.481481481481481e-07, |
|
"loss": 0.9672, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 2.216777734221874, |
|
"grad_norm": 6.373151779174805, |
|
"learning_rate": 6.388888888888889e-07, |
|
"loss": 0.9713, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 2.218737749901999, |
|
"grad_norm": 7.9771575927734375, |
|
"learning_rate": 6.296296296296296e-07, |
|
"loss": 0.9674, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 2.2206977655821247, |
|
"grad_norm": 7.23372745513916, |
|
"learning_rate": 6.203703703703705e-07, |
|
"loss": 0.9787, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 2.2226577812622503, |
|
"grad_norm": 7.515969276428223, |
|
"learning_rate": 6.111111111111112e-07, |
|
"loss": 0.9702, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 2.2246177969423755, |
|
"grad_norm": 6.468687534332275, |
|
"learning_rate": 6.018518518518519e-07, |
|
"loss": 0.981, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 2.226577812622501, |
|
"grad_norm": 6.765408992767334, |
|
"learning_rate": 5.925925925925927e-07, |
|
"loss": 0.9757, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 2.2285378283026263, |
|
"grad_norm": 8.031702041625977, |
|
"learning_rate": 5.833333333333334e-07, |
|
"loss": 0.9643, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 2.230497843982752, |
|
"grad_norm": 7.857892990112305, |
|
"learning_rate": 5.740740740740741e-07, |
|
"loss": 0.9712, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 2.2324578596628775, |
|
"grad_norm": 6.461490154266357, |
|
"learning_rate": 5.648148148148148e-07, |
|
"loss": 0.9664, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 2.2344178753430026, |
|
"grad_norm": 7.290163993835449, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 0.9554, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.2363778910231282, |
|
"grad_norm": 7.196146011352539, |
|
"learning_rate": 5.462962962962963e-07, |
|
"loss": 0.9755, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 2.238337906703254, |
|
"grad_norm": 7.482301712036133, |
|
"learning_rate": 5.37037037037037e-07, |
|
"loss": 0.9577, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 2.240297922383379, |
|
"grad_norm": 7.794521808624268, |
|
"learning_rate": 5.277777777777779e-07, |
|
"loss": 0.9679, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 2.2422579380635046, |
|
"grad_norm": 6.857132434844971, |
|
"learning_rate": 5.185185185185186e-07, |
|
"loss": 0.9561, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 2.2442179537436298, |
|
"grad_norm": 6.887381553649902, |
|
"learning_rate": 5.092592592592593e-07, |
|
"loss": 0.9712, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 2.2461779694237554, |
|
"grad_norm": 7.537138938903809, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.9755, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 2.248137985103881, |
|
"grad_norm": 7.604095935821533, |
|
"learning_rate": 4.907407407407408e-07, |
|
"loss": 0.968, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 2.250098000784006, |
|
"grad_norm": 6.840078830718994, |
|
"learning_rate": 4.814814814814815e-07, |
|
"loss": 0.9677, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 2.2520580164641317, |
|
"grad_norm": 7.0317301750183105, |
|
"learning_rate": 4.7222222222222226e-07, |
|
"loss": 0.9635, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 2.2540180321442573, |
|
"grad_norm": 7.006657123565674, |
|
"learning_rate": 4.6296296296296297e-07, |
|
"loss": 0.96, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.2559780478243825, |
|
"grad_norm": 7.701179504394531, |
|
"learning_rate": 4.537037037037038e-07, |
|
"loss": 0.9747, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 2.257938063504508, |
|
"grad_norm": 7.04123592376709, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 0.9632, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 2.2598980791846337, |
|
"grad_norm": 7.217432022094727, |
|
"learning_rate": 4.3518518518518525e-07, |
|
"loss": 0.9675, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 2.261858094864759, |
|
"grad_norm": 7.661186695098877, |
|
"learning_rate": 4.2592592592592596e-07, |
|
"loss": 0.984, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 2.2638181105448845, |
|
"grad_norm": 11.790616989135742, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 0.9724, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 2.2657781262250096, |
|
"grad_norm": 7.301620960235596, |
|
"learning_rate": 4.074074074074075e-07, |
|
"loss": 0.9601, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 2.2677381419051352, |
|
"grad_norm": 8.462876319885254, |
|
"learning_rate": 3.981481481481482e-07, |
|
"loss": 0.9752, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 2.269698157585261, |
|
"grad_norm": 6.7517547607421875, |
|
"learning_rate": 3.8888888888888895e-07, |
|
"loss": 0.9762, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 2.271658173265386, |
|
"grad_norm": 7.299182891845703, |
|
"learning_rate": 3.7962962962962966e-07, |
|
"loss": 0.972, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 2.2736181889455116, |
|
"grad_norm": 7.082560062408447, |
|
"learning_rate": 3.7037037037037036e-07, |
|
"loss": 0.9679, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 2.2755782046256368, |
|
"grad_norm": 7.469284534454346, |
|
"learning_rate": 3.611111111111111e-07, |
|
"loss": 0.9546, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 2.2775382203057624, |
|
"grad_norm": 7.5840535163879395, |
|
"learning_rate": 3.518518518518519e-07, |
|
"loss": 0.9761, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 2.279498235985888, |
|
"grad_norm": 8.262914657592773, |
|
"learning_rate": 3.4259259259259265e-07, |
|
"loss": 0.9646, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 2.281458251666013, |
|
"grad_norm": 8.672895431518555, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.9732, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 2.2834182673461387, |
|
"grad_norm": 7.6948347091674805, |
|
"learning_rate": 3.2407407407407406e-07, |
|
"loss": 0.9726, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 2.2853782830262643, |
|
"grad_norm": 6.536133289337158, |
|
"learning_rate": 3.148148148148148e-07, |
|
"loss": 0.9694, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 2.2873382987063895, |
|
"grad_norm": 8.90619945526123, |
|
"learning_rate": 3.055555555555556e-07, |
|
"loss": 0.9793, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 2.289298314386515, |
|
"grad_norm": 7.702218055725098, |
|
"learning_rate": 2.9629629629629634e-07, |
|
"loss": 0.9642, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 2.2912583300666407, |
|
"grad_norm": 8.119073867797852, |
|
"learning_rate": 2.8703703703703705e-07, |
|
"loss": 0.9641, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 2.293218345746766, |
|
"grad_norm": 7.159200191497803, |
|
"learning_rate": 2.7777777777777776e-07, |
|
"loss": 0.9636, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 2.2951783614268915, |
|
"grad_norm": 6.998493671417236, |
|
"learning_rate": 2.685185185185185e-07, |
|
"loss": 0.9664, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 2.297138377107017, |
|
"grad_norm": 7.50088357925415, |
|
"learning_rate": 2.592592592592593e-07, |
|
"loss": 0.9787, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 2.2990983927871422, |
|
"grad_norm": 7.634678363800049, |
|
"learning_rate": 2.5000000000000004e-07, |
|
"loss": 0.9712, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 2.301058408467268, |
|
"grad_norm": 7.635111331939697, |
|
"learning_rate": 2.4074074074074075e-07, |
|
"loss": 0.9694, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 2.303018424147393, |
|
"grad_norm": 7.645463466644287, |
|
"learning_rate": 2.3148148148148148e-07, |
|
"loss": 0.9717, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 2.3049784398275186, |
|
"grad_norm": 6.8600568771362305, |
|
"learning_rate": 2.2222222222222224e-07, |
|
"loss": 0.9495, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 2.306938455507644, |
|
"grad_norm": 7.019852638244629, |
|
"learning_rate": 2.1296296296296298e-07, |
|
"loss": 0.9527, |
|
"step": 11770 |
|
}, |
|
{ |
|
"epoch": 2.3088984711877694, |
|
"grad_norm": 8.227087020874023, |
|
"learning_rate": 2.0370370370370374e-07, |
|
"loss": 0.9608, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 2.310858486867895, |
|
"grad_norm": 7.640336036682129, |
|
"learning_rate": 1.9444444444444447e-07, |
|
"loss": 0.9684, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 2.3128185025480206, |
|
"grad_norm": 7.304652690887451, |
|
"learning_rate": 1.8518518518518518e-07, |
|
"loss": 0.9698, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 2.3147785182281457, |
|
"grad_norm": 7.173428535461426, |
|
"learning_rate": 1.7592592592592594e-07, |
|
"loss": 0.9635, |
|
"step": 11810 |
|
}, |
|
{ |
|
"epoch": 2.3167385339082713, |
|
"grad_norm": 7.876226902008057, |
|
"learning_rate": 1.6666666666666668e-07, |
|
"loss": 0.9606, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 2.3186985495883965, |
|
"grad_norm": 6.757920265197754, |
|
"learning_rate": 1.574074074074074e-07, |
|
"loss": 0.9692, |
|
"step": 11830 |
|
}, |
|
{ |
|
"epoch": 2.320658565268522, |
|
"grad_norm": 8.267070770263672, |
|
"learning_rate": 1.4814814814814817e-07, |
|
"loss": 0.9565, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 2.3226185809486477, |
|
"grad_norm": 7.382258892059326, |
|
"learning_rate": 1.3888888888888888e-07, |
|
"loss": 0.9672, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 2.324578596628773, |
|
"grad_norm": 7.452052116394043, |
|
"learning_rate": 1.2962962962962964e-07, |
|
"loss": 0.9649, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 2.3265386123088985, |
|
"grad_norm": 9.273903846740723, |
|
"learning_rate": 1.2037037037037037e-07, |
|
"loss": 0.9652, |
|
"step": 11870 |
|
}, |
|
{ |
|
"epoch": 2.328498627989024, |
|
"grad_norm": 6.987192153930664, |
|
"learning_rate": 1.1111111111111112e-07, |
|
"loss": 0.9571, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 2.3304586436691492, |
|
"grad_norm": 6.799464702606201, |
|
"learning_rate": 1.0185185185185187e-07, |
|
"loss": 0.9707, |
|
"step": 11890 |
|
}, |
|
{ |
|
"epoch": 2.332418659349275, |
|
"grad_norm": 7.16199254989624, |
|
"learning_rate": 9.259259259259259e-08, |
|
"loss": 0.9734, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.332418659349275, |
|
"eval_loss": 1.0144330263137817, |
|
"eval_runtime": 14.0924, |
|
"eval_samples_per_second": 46.408, |
|
"eval_steps_per_second": 5.819, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.3343786750294004, |
|
"grad_norm": 8.338574409484863, |
|
"learning_rate": 8.333333333333334e-08, |
|
"loss": 0.9714, |
|
"step": 11910 |
|
}, |
|
{ |
|
"epoch": 2.3363386907095256, |
|
"grad_norm": 6.403331279754639, |
|
"learning_rate": 7.407407407407409e-08, |
|
"loss": 0.959, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 2.338298706389651, |
|
"grad_norm": 8.215003967285156, |
|
"learning_rate": 6.481481481481482e-08, |
|
"loss": 0.964, |
|
"step": 11930 |
|
}, |
|
{ |
|
"epoch": 2.3402587220697764, |
|
"grad_norm": 7.930845737457275, |
|
"learning_rate": 5.555555555555556e-08, |
|
"loss": 0.9575, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 2.342218737749902, |
|
"grad_norm": 7.033510684967041, |
|
"learning_rate": 4.6296296296296295e-08, |
|
"loss": 0.9626, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 2.3441787534300276, |
|
"grad_norm": 7.8353800773620605, |
|
"learning_rate": 3.703703703703704e-08, |
|
"loss": 0.9673, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 2.3461387691101527, |
|
"grad_norm": 7.304898738861084, |
|
"learning_rate": 2.777777777777778e-08, |
|
"loss": 0.9618, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 2.3480987847902783, |
|
"grad_norm": 7.420264720916748, |
|
"learning_rate": 1.851851851851852e-08, |
|
"loss": 0.9544, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 2.350058800470404, |
|
"grad_norm": 7.217324256896973, |
|
"learning_rate": 9.25925925925926e-09, |
|
"loss": 0.9752, |
|
"step": 11990 |
|
}, |
|
{ |
|
"epoch": 2.352018816150529, |
|
"grad_norm": 7.595900058746338, |
|
"learning_rate": 0.0, |
|
"loss": 0.9548, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.352018816150529, |
|
"eval_loss": 1.0147355794906616, |
|
"eval_runtime": 14.1258, |
|
"eval_samples_per_second": 46.298, |
|
"eval_steps_per_second": 5.805, |
|
"step": 12000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 12000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.2834751593089335e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|