ErrorAI committed
Commit 41e9e58 · verified · 1 Parent(s): 8f767fc

Training in progress, step 372, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:58100fe655c8ed92d4518fef45e29df4d60c7e291f66f716456f73e4ea77f392
+oid sha256:4db2aaaf0e71f4163eb1948613e8b578254a0de5a98794daab0fd666ff0e9335
 size 144805440
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab152d90f6d47ebd8a356ceecca0993fb077dcff867c7c58ca00456e2cfcd04b
-size 74291604
+oid sha256:97e66fb0be2050837da15f447248f80a1c314ca1bc2a805d9029af38817476ae
+size 74292308
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f660891b19594633ced246d59eedd400fe2556d319f4e5ca333df7fb57888180
+oid sha256:956eaecea1513e07a98b3b792863d7c1c440c5da69923f2e87c397dfc0da01f4
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:81771ff96e80b84ed048126e169640f8617ceb476fe2f91b8561190057e53b0d
+oid sha256:d6484c651d3d8bf75888a89e2001dae4da70b271bc6cafa91a994f199bf79e17
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.5002521432173475,
+  "epoch": 0.7503782148260212,
   "eval_steps": 500,
-  "global_step": 248,
+  "global_step": 372,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1743,6 +1743,874 @@
       "learning_rate": 5.079976035714976e-05,
       "loss": 0.7198,
       "step": 248
+    },
+    {
+      "epoch": 0.5022692889561271,
+      "grad_norm": 0.6312721371650696,
+      "learning_rate": 5.047986931116205e-05,
+      "loss": 0.7787,
+      "step": 249
+    },
+    {
+      "epoch": 0.5042864346949067,
+      "grad_norm": 0.6478500962257385,
+      "learning_rate": 5.015995861989287e-05,
+      "loss": 0.6478,
+      "step": 250
+    },
+    {
+      "epoch": 0.5063035804336863,
+      "grad_norm": 0.370857834815979,
+      "learning_rate": 4.984004138010715e-05,
+      "loss": 0.4671,
+      "step": 251
+    },
+    {
+      "epoch": 0.5083207261724659,
+      "grad_norm": 0.432366281747818,
+      "learning_rate": 4.952013068883795e-05,
+      "loss": 0.6026,
+      "step": 252
+    },
+    {
+      "epoch": 0.5103378719112456,
+      "grad_norm": 0.4328523576259613,
+      "learning_rate": 4.920023964285025e-05,
+      "loss": 0.6082,
+      "step": 253
+    },
+    {
+      "epoch": 0.5123550176500252,
+      "grad_norm": 0.40599900484085083,
+      "learning_rate": 4.8880381338104777e-05,
+      "loss": 0.4708,
+      "step": 254
+    },
+    {
+      "epoch": 0.5143721633888049,
+      "grad_norm": 0.43550002574920654,
+      "learning_rate": 4.8560568869221805e-05,
+      "loss": 0.5816,
+      "step": 255
+    },
+    {
+      "epoch": 0.5163893091275845,
+      "grad_norm": 0.40016648173332214,
+      "learning_rate": 4.8240815328945226e-05,
+      "loss": 0.5691,
+      "step": 256
+    },
+    {
+      "epoch": 0.518406454866364,
+      "grad_norm": 0.437029629945755,
+      "learning_rate": 4.7921133807606424e-05,
+      "loss": 0.5555,
+      "step": 257
+    },
+    {
+      "epoch": 0.5204236006051437,
+      "grad_norm": 0.39981645345687866,
+      "learning_rate": 4.760153739258843e-05,
+      "loss": 0.5786,
+      "step": 258
+    },
+    {
+      "epoch": 0.5224407463439233,
+      "grad_norm": 0.4225994646549225,
+      "learning_rate": 4.728203916779009e-05,
+      "loss": 0.5744,
+      "step": 259
+    },
+    {
+      "epoch": 0.524457892082703,
+      "grad_norm": 0.4146437346935272,
+      "learning_rate": 4.69626522130905e-05,
+      "loss": 0.5433,
+      "step": 260
+    },
+    {
+      "epoch": 0.5264750378214826,
+      "grad_norm": 0.41293078660964966,
+      "learning_rate": 4.6643389603813486e-05,
+      "loss": 0.5461,
+      "step": 261
+    },
+    {
+      "epoch": 0.5284921835602622,
+      "grad_norm": 0.4155355989933014,
+      "learning_rate": 4.632426441019227e-05,
+      "loss": 0.5554,
+      "step": 262
+    },
+    {
+      "epoch": 0.5305093292990418,
+      "grad_norm": 0.4077325761318207,
+      "learning_rate": 4.600528969683448e-05,
+      "loss": 0.4872,
+      "step": 263
+    },
+    {
+      "epoch": 0.5325264750378215,
+      "grad_norm": 0.4461796283721924,
+      "learning_rate": 4.568647852218725e-05,
+      "loss": 0.5266,
+      "step": 264
+    },
+    {
+      "epoch": 0.5345436207766011,
+      "grad_norm": 0.40012630820274353,
+      "learning_rate": 4.5367843938002694e-05,
+      "loss": 0.5459,
+      "step": 265
+    },
+    {
+      "epoch": 0.5365607665153808,
+      "grad_norm": 0.45490312576293945,
+      "learning_rate": 4.504939898880339e-05,
+      "loss": 0.6255,
+      "step": 266
+    },
+    {
+      "epoch": 0.5385779122541604,
+      "grad_norm": 0.41126129031181335,
+      "learning_rate": 4.473115671134859e-05,
+      "loss": 0.5786,
+      "step": 267
+    },
+    {
+      "epoch": 0.5405950579929399,
+      "grad_norm": 0.3975431025028229,
+      "learning_rate": 4.441313013410039e-05,
+      "loss": 0.5707,
+      "step": 268
+    },
+    {
+      "epoch": 0.5426122037317196,
+      "grad_norm": 0.40477657318115234,
+      "learning_rate": 4.409533227669033e-05,
+      "loss": 0.5079,
+      "step": 269
+    },
+    {
+      "epoch": 0.5446293494704992,
+      "grad_norm": 0.4053346514701843,
+      "learning_rate": 4.377777614938647e-05,
+      "loss": 0.6502,
+      "step": 270
+    },
+    {
+      "epoch": 0.5466464952092789,
+      "grad_norm": 0.4417206346988678,
+      "learning_rate": 4.3460474752560724e-05,
+      "loss": 0.6215,
+      "step": 271
+    },
+    {
+      "epoch": 0.5486636409480585,
+      "grad_norm": 0.4087637960910797,
+      "learning_rate": 4.314344107615665e-05,
+      "loss": 0.5612,
+      "step": 272
+    },
+    {
+      "epoch": 0.5506807866868382,
+      "grad_norm": 0.3786230683326721,
+      "learning_rate": 4.282668809915758e-05,
+      "loss": 0.5299,
+      "step": 273
+    },
+    {
+      "epoch": 0.5526979324256177,
+      "grad_norm": 0.3989951014518738,
+      "learning_rate": 4.251022878905543e-05,
+      "loss": 0.5832,
+      "step": 274
+    },
+    {
+      "epoch": 0.5547150781643974,
+      "grad_norm": 0.45247653126716614,
+      "learning_rate": 4.219407610131971e-05,
+      "loss": 0.5932,
+      "step": 275
+    },
+    {
+      "epoch": 0.556732223903177,
+      "grad_norm": 0.40874481201171875,
+      "learning_rate": 4.187824297886715e-05,
+      "loss": 0.6128,
+      "step": 276
+    },
+    {
+      "epoch": 0.5587493696419567,
+      "grad_norm": 0.42016932368278503,
+      "learning_rate": 4.156274235153189e-05,
+      "loss": 0.536,
+      "step": 277
+    },
+    {
+      "epoch": 0.5607665153807363,
+      "grad_norm": 0.4106806814670563,
+      "learning_rate": 4.1247587135536126e-05,
+      "loss": 0.587,
+      "step": 278
+    },
+    {
+      "epoch": 0.5627836611195158,
+      "grad_norm": 0.4463154077529907,
+      "learning_rate": 4.0932790232961235e-05,
+      "loss": 0.5121,
+      "step": 279
+    },
+    {
+      "epoch": 0.5648008068582955,
+      "grad_norm": 0.45139235258102417,
+      "learning_rate": 4.0618364531219775e-05,
+      "loss": 0.5219,
+      "step": 280
+    },
+    {
+      "epoch": 0.5668179525970751,
+      "grad_norm": 0.45914503931999207,
+      "learning_rate": 4.030432290252771e-05,
+      "loss": 0.6114,
+      "step": 281
+    },
+    {
+      "epoch": 0.5688350983358548,
+      "grad_norm": 0.43565309047698975,
+      "learning_rate": 3.999067820337757e-05,
+      "loss": 0.577,
+      "step": 282
+    },
+    {
+      "epoch": 0.5708522440746344,
+      "grad_norm": 0.4499742090702057,
+      "learning_rate": 3.967744327401197e-05,
+      "loss": 0.537,
+      "step": 283
+    },
+    {
+      "epoch": 0.5728693898134141,
+      "grad_norm": 0.44511550664901733,
+      "learning_rate": 3.936463093789813e-05,
+      "loss": 0.6069,
+      "step": 284
+    },
+    {
+      "epoch": 0.5748865355521936,
+      "grad_norm": 0.4442926347255707,
+      "learning_rate": 3.9052254001202746e-05,
+      "loss": 0.601,
+      "step": 285
+    },
+    {
+      "epoch": 0.5769036812909732,
+      "grad_norm": 0.4780077636241913,
+      "learning_rate": 3.8740325252267785e-05,
+      "loss": 0.5617,
+      "step": 286
+    },
+    {
+      "epoch": 0.5789208270297529,
+      "grad_norm": 0.4674513638019562,
+      "learning_rate": 3.842885746108693e-05,
+      "loss": 0.6245,
+      "step": 287
+    },
+    {
+      "epoch": 0.5809379727685325,
+      "grad_norm": 0.49114686250686646,
+      "learning_rate": 3.811786337878284e-05,
+      "loss": 0.6625,
+      "step": 288
+    },
+    {
+      "epoch": 0.5829551185073122,
+      "grad_norm": 0.5057556629180908,
+      "learning_rate": 3.780735573708508e-05,
+      "loss": 0.6384,
+      "step": 289
+    },
+    {
+      "epoch": 0.5849722642460918,
+      "grad_norm": 0.4727246165275574,
+      "learning_rate": 3.7497347247808846e-05,
+      "loss": 0.7091,
+      "step": 290
+    },
+    {
+      "epoch": 0.5869894099848714,
+      "grad_norm": 0.5270060300827026,
+      "learning_rate": 3.718785060233471e-05,
+      "loss": 0.673,
+      "step": 291
+    },
+    {
+      "epoch": 0.589006555723651,
+      "grad_norm": 0.5089999437332153,
+      "learning_rate": 3.687887847108894e-05,
+      "loss": 0.6378,
+      "step": 292
+    },
+    {
+      "epoch": 0.5910237014624307,
+      "grad_norm": 0.5260120630264282,
+      "learning_rate": 3.657044350302479e-05,
+      "loss": 0.6278,
+      "step": 293
+    },
+    {
+      "epoch": 0.5930408472012103,
+      "grad_norm": 0.5153591632843018,
+      "learning_rate": 3.6262558325104695e-05,
+      "loss": 0.6463,
+      "step": 294
+    },
+    {
+      "epoch": 0.59505799293999,
+      "grad_norm": 0.5067986845970154,
+      "learning_rate": 3.595523554178336e-05,
+      "loss": 0.6946,
+      "step": 295
+    },
+    {
+      "epoch": 0.5970751386787695,
+      "grad_norm": 0.5289713740348816,
+      "learning_rate": 3.564848773449172e-05,
+      "loss": 0.6358,
+      "step": 296
+    },
+    {
+      "epoch": 0.5990922844175491,
+      "grad_norm": 0.5743799805641174,
+      "learning_rate": 3.5342327461121805e-05,
+      "loss": 0.6664,
+      "step": 297
+    },
+    {
+      "epoch": 0.6011094301563288,
+      "grad_norm": 0.6473139524459839,
+      "learning_rate": 3.503676725551278e-05,
+      "loss": 0.6851,
+      "step": 298
+    },
+    {
+      "epoch": 0.6031265758951084,
+      "grad_norm": 0.6861950755119324,
+      "learning_rate": 3.473181962693773e-05,
+      "loss": 0.6171,
+      "step": 299
+    },
+    {
+      "epoch": 0.6051437216338881,
+      "grad_norm": 0.6407880783081055,
+      "learning_rate": 3.442749705959152e-05,
+      "loss": 0.5958,
+      "step": 300
+    },
+    {
+      "epoch": 0.6071608673726677,
+      "grad_norm": 0.3541804552078247,
+      "learning_rate": 3.412381201207979e-05,
+      "loss": 0.5327,
+      "step": 301
+    },
+    {
+      "epoch": 0.6091780131114473,
+      "grad_norm": 0.3857956528663635,
+      "learning_rate": 3.3820776916908857e-05,
+      "loss": 0.5675,
+      "step": 302
+    },
+    {
+      "epoch": 0.6111951588502269,
+      "grad_norm": 0.41454771161079407,
+      "learning_rate": 3.351840417997679e-05,
+      "loss": 0.5419,
+      "step": 303
+    },
+    {
+      "epoch": 0.6132123045890066,
+      "grad_norm": 0.41526558995246887,
+      "learning_rate": 3.321670618006543e-05,
+      "loss": 0.5429,
+      "step": 304
+    },
+    {
+      "epoch": 0.6152294503277862,
+      "grad_norm": 0.4046734571456909,
+      "learning_rate": 3.291569526833372e-05,
+      "loss": 0.5813,
+      "step": 305
+    },
+    {
+      "epoch": 0.6172465960665658,
+      "grad_norm": 0.43739229440689087,
+      "learning_rate": 3.2615383767812056e-05,
+      "loss": 0.6181,
+      "step": 306
+    },
+    {
+      "epoch": 0.6192637418053455,
+      "grad_norm": 0.42888522148132324,
+      "learning_rate": 3.231578397289772e-05,
+      "loss": 0.492,
+      "step": 307
+    },
+    {
+      "epoch": 0.621280887544125,
+      "grad_norm": 0.4459590017795563,
+      "learning_rate": 3.2016908148851624e-05,
+      "loss": 0.5323,
+      "step": 308
+    },
+    {
+      "epoch": 0.6232980332829047,
+      "grad_norm": 0.440335750579834,
+      "learning_rate": 3.1718768531296196e-05,
+      "loss": 0.5439,
+      "step": 309
+    },
+    {
+      "epoch": 0.6253151790216843,
+      "grad_norm": 0.422626793384552,
+      "learning_rate": 3.142137732571437e-05,
+      "loss": 0.538,
+      "step": 310
+    },
+    {
+      "epoch": 0.627332324760464,
+      "grad_norm": 0.3488753139972687,
+      "learning_rate": 3.112474670695008e-05,
+      "loss": 0.4664,
+      "step": 311
+    },
+    {
+      "epoch": 0.6293494704992436,
+      "grad_norm": 0.4092547297477722,
+      "learning_rate": 3.0828888818709656e-05,
+      "loss": 0.4762,
+      "step": 312
+    },
+    {
+      "epoch": 0.6313666162380231,
+      "grad_norm": 0.3810945749282837,
+      "learning_rate": 3.053381577306481e-05,
+      "loss": 0.4807,
+      "step": 313
+    },
+    {
+      "epoch": 0.6333837619768028,
+      "grad_norm": 0.3901941180229187,
+      "learning_rate": 3.0239539649956665e-05,
+      "loss": 0.505,
+      "step": 314
+    },
+    {
+      "epoch": 0.6354009077155824,
+      "grad_norm": 0.3677188456058502,
+      "learning_rate": 2.9946072496701334e-05,
+      "loss": 0.5334,
+      "step": 315
+    },
+    {
+      "epoch": 0.6374180534543621,
+      "grad_norm": 0.3920693099498749,
+      "learning_rate": 2.9653426327496647e-05,
+      "loss": 0.5632,
+      "step": 316
+    },
+    {
+      "epoch": 0.6394351991931417,
+      "grad_norm": 0.3857486844062805,
+      "learning_rate": 2.9361613122930304e-05,
+      "loss": 0.5151,
+      "step": 317
+    },
+    {
+      "epoch": 0.6414523449319214,
+      "grad_norm": 0.4166855812072754,
+      "learning_rate": 2.9070644829489434e-05,
+      "loss": 0.534,
+      "step": 318
+    },
+    {
+      "epoch": 0.6434694906707009,
+      "grad_norm": 0.40017688274383545,
+      "learning_rate": 2.8780533359071504e-05,
+      "loss": 0.4967,
+      "step": 319
+    },
+    {
+      "epoch": 0.6454866364094806,
+      "grad_norm": 0.45257654786109924,
+      "learning_rate": 2.8491290588496668e-05,
+      "loss": 0.5979,
+      "step": 320
+    },
+    {
+      "epoch": 0.6475037821482602,
+      "grad_norm": 0.4371342360973358,
+      "learning_rate": 2.820292835902148e-05,
+      "loss": 0.5411,
+      "step": 321
+    },
+    {
+      "epoch": 0.6495209278870399,
+      "grad_norm": 0.4240793287754059,
+      "learning_rate": 2.7915458475854283e-05,
+      "loss": 0.5436,
+      "step": 322
+    },
+    {
+      "epoch": 0.6515380736258195,
+      "grad_norm": 0.41990602016448975,
+      "learning_rate": 2.762889270767175e-05,
+      "loss": 0.523,
+      "step": 323
+    },
+    {
+      "epoch": 0.653555219364599,
+      "grad_norm": 0.45127952098846436,
+      "learning_rate": 2.7343242786137168e-05,
+      "loss": 0.5283,
+      "step": 324
+    },
+    {
+      "epoch": 0.6555723651033787,
+      "grad_norm": 0.4676419496536255,
+      "learning_rate": 2.7058520405420123e-05,
+      "loss": 0.5815,
+      "step": 325
+    },
+    {
+      "epoch": 0.6575895108421583,
+      "grad_norm": 0.42602765560150146,
+      "learning_rate": 2.677473722171786e-05,
+      "loss": 0.5368,
+      "step": 326
+    },
+    {
+      "epoch": 0.659606656580938,
+      "grad_norm": 0.43028366565704346,
+      "learning_rate": 2.649190485277792e-05,
+      "loss": 0.5435,
+      "step": 327
+    },
+    {
+      "epoch": 0.6616238023197176,
+      "grad_norm": 0.4180251955986023,
+      "learning_rate": 2.621003487742264e-05,
+      "loss": 0.529,
+      "step": 328
+    },
+    {
+      "epoch": 0.6636409480584973,
+      "grad_norm": 0.4283643364906311,
+      "learning_rate": 2.5929138835075152e-05,
+      "loss": 0.6257,
+      "step": 329
+    },
+    {
+      "epoch": 0.6656580937972768,
+      "grad_norm": 0.4437110424041748,
+      "learning_rate": 2.564922822528686e-05,
+      "loss": 0.5602,
+      "step": 330
+    },
+    {
+      "epoch": 0.6676752395360565,
+      "grad_norm": 0.464622437953949,
+      "learning_rate": 2.5370314507266756e-05,
+      "loss": 0.5834,
+      "step": 331
+    },
+    {
+      "epoch": 0.6696923852748361,
+      "grad_norm": 0.4433290958404541,
+      "learning_rate": 2.5092409099412227e-05,
+      "loss": 0.5993,
+      "step": 332
+    },
+    {
+      "epoch": 0.6717095310136157,
+      "grad_norm": 0.4574563503265381,
+      "learning_rate": 2.4815523378841726e-05,
+      "loss": 0.6211,
+      "step": 333
+    },
+    {
+      "epoch": 0.6737266767523954,
+      "grad_norm": 0.43739160895347595,
+      "learning_rate": 2.4539668680928784e-05,
+      "loss": 0.4989,
+      "step": 334
+    },
+    {
+      "epoch": 0.675743822491175,
+      "grad_norm": 0.4508601129055023,
+      "learning_rate": 2.4264856298838213e-05,
+      "loss": 0.6101,
+      "step": 335
+    },
+    {
+      "epoch": 0.6777609682299546,
+      "grad_norm": 0.4525294005870819,
+      "learning_rate": 2.399109748306355e-05,
+      "loss": 0.5605,
+      "step": 336
+    },
+    {
+      "epoch": 0.6797781139687342,
+      "grad_norm": 0.444223552942276,
+      "learning_rate": 2.371840344096665e-05,
+      "loss": 0.5952,
+      "step": 337
+    },
+    {
+      "epoch": 0.6817952597075139,
+      "grad_norm": 0.4897475242614746,
+      "learning_rate": 2.3446785336318754e-05,
+      "loss": 0.5829,
+      "step": 338
+    },
+    {
+      "epoch": 0.6838124054462935,
+      "grad_norm": 0.48455387353897095,
+      "learning_rate": 2.317625428884348e-05,
+      "loss": 0.6363,
+      "step": 339
+    },
+    {
+      "epoch": 0.6858295511850732,
+      "grad_norm": 0.5362589955329895,
+      "learning_rate": 2.290682137376169e-05,
+      "loss": 0.6597,
+      "step": 340
+    },
+    {
+      "epoch": 0.6878466969238527,
+      "grad_norm": 0.518014669418335,
+      "learning_rate": 2.263849762133788e-05,
+      "loss": 0.5839,
+      "step": 341
+    },
+    {
+      "epoch": 0.6898638426626323,
+      "grad_norm": 0.504257082939148,
+      "learning_rate": 2.237129401642887e-05,
+      "loss": 0.6091,
+      "step": 342
+    },
+    {
+      "epoch": 0.691880988401412,
+      "grad_norm": 0.5075199007987976,
+      "learning_rate": 2.2105221498033862e-05,
+      "loss": 0.5976,
+      "step": 343
+    },
+    {
+      "epoch": 0.6938981341401916,
+      "grad_norm": 0.5348207354545593,
+      "learning_rate": 2.1840290958846816e-05,
+      "loss": 0.6331,
+      "step": 344
+    },
+    {
+      "epoch": 0.6959152798789713,
+      "grad_norm": 0.6212106347084045,
+      "learning_rate": 2.157651324481033e-05,
+      "loss": 0.7058,
+      "step": 345
+    },
+    {
+      "epoch": 0.6979324256177509,
+      "grad_norm": 0.5681262612342834,
+      "learning_rate": 2.131389915467173e-05,
+      "loss": 0.6257,
+      "step": 346
+    },
+    {
+      "epoch": 0.6999495713565305,
+      "grad_norm": 0.5732640624046326,
+      "learning_rate": 2.1052459439541005e-05,
+      "loss": 0.6812,
+      "step": 347
+    },
+    {
+      "epoch": 0.7019667170953101,
+      "grad_norm": 0.5906322002410889,
+      "learning_rate": 2.0792204802450515e-05,
+      "loss": 0.6659,
+      "step": 348
+    },
+    {
+      "epoch": 0.7039838628340898,
+      "grad_norm": 0.6746000051498413,
+      "learning_rate": 2.0533145897917057e-05,
+      "loss": 0.6781,
+      "step": 349
+    },
+    {
+      "epoch": 0.7060010085728694,
+      "grad_norm": 0.7686178684234619,
+      "learning_rate": 2.0275293331505436e-05,
+      "loss": 0.7789,
+      "step": 350
+    },
+    {
+      "epoch": 0.708018154311649,
+      "grad_norm": 0.3885047733783722,
+      "learning_rate": 2.0018657659394496e-05,
+      "loss": 0.5026,
+      "step": 351
+    },
+    {
+      "epoch": 0.7100353000504287,
+      "grad_norm": 0.4007488489151001,
+      "learning_rate": 1.976324938794482e-05,
+      "loss": 0.5077,
+      "step": 352
+    },
+    {
+      "epoch": 0.7120524457892082,
+      "grad_norm": 0.4343924820423126,
+      "learning_rate": 1.9509078973268645e-05,
+      "loss": 0.5361,
+      "step": 353
+    },
+    {
+      "epoch": 0.7140695915279879,
+      "grad_norm": 0.42959120869636536,
+      "learning_rate": 1.9256156820801895e-05,
+      "loss": 0.551,
+      "step": 354
+    },
+    {
+      "epoch": 0.7160867372667675,
+      "grad_norm": 0.4766654372215271,
+      "learning_rate": 1.9004493284877995e-05,
+      "loss": 0.5484,
+      "step": 355
+    },
+    {
+      "epoch": 0.7181038830055472,
+      "grad_norm": 0.41945040225982666,
+      "learning_rate": 1.875409866830422e-05,
+      "loss": 0.5211,
+      "step": 356
+    },
+    {
+      "epoch": 0.7201210287443268,
+      "grad_norm": 0.4797287881374359,
+      "learning_rate": 1.850498322193972e-05,
+      "loss": 0.531,
+      "step": 357
+    },
+    {
+      "epoch": 0.7221381744831064,
+      "grad_norm": 0.4228487014770508,
+      "learning_rate": 1.825715714427594e-05,
+      "loss": 0.5317,
+      "step": 358
+    },
+    {
+      "epoch": 0.724155320221886,
+      "grad_norm": 0.4410184919834137,
+      "learning_rate": 1.8010630581019095e-05,
+      "loss": 0.5857,
+      "step": 359
+    },
+    {
+      "epoch": 0.7261724659606656,
+      "grad_norm": 0.3858675956726074,
+      "learning_rate": 1.7765413624674866e-05,
+      "loss": 0.4873,
+      "step": 360
+    },
+    {
+      "epoch": 0.7281896116994453,
+      "grad_norm": 0.42531728744506836,
+      "learning_rate": 1.752151631413511e-05,
+      "loss": 0.4964,
+      "step": 361
+    },
+    {
+      "epoch": 0.7302067574382249,
+      "grad_norm": 0.4145807921886444,
+      "learning_rate": 1.7278948634266968e-05,
+      "loss": 0.5002,
+      "step": 362
+    },
+    {
+      "epoch": 0.7322239031770046,
+      "grad_norm": 0.41224363446235657,
+      "learning_rate": 1.703772051550412e-05,
+      "loss": 0.5269,
+      "step": 363
+    },
+    {
+      "epoch": 0.7342410489157841,
+      "grad_norm": 0.3786209225654602,
+      "learning_rate": 1.679784183344014e-05,
+      "loss": 0.4877,
+      "step": 364
+    },
+    {
+      "epoch": 0.7362581946545638,
+      "grad_norm": 0.4014703035354614,
+      "learning_rate": 1.6559322408424287e-05,
+      "loss": 0.4801,
+      "step": 365
+    },
+    {
+      "epoch": 0.7382753403933434,
+      "grad_norm": 0.4308634400367737,
+      "learning_rate": 1.6322172005159435e-05,
+      "loss": 0.5762,
+      "step": 366
+    },
+    {
+      "epoch": 0.7402924861321231,
+      "grad_norm": 0.39501944184303284,
+      "learning_rate": 1.608640033230236e-05,
+      "loss": 0.5248,
+      "step": 367
+    },
+    {
+      "epoch": 0.7423096318709027,
+      "grad_norm": 0.44487130641937256,
+      "learning_rate": 1.5852017042066214e-05,
+      "loss": 0.5026,
+      "step": 368
+    },
+    {
+      "epoch": 0.7443267776096822,
+      "grad_norm": 0.41640734672546387,
+      "learning_rate": 1.5619031729825402e-05,
+      "loss": 0.5583,
+      "step": 369
+    },
+    {
+      "epoch": 0.7463439233484619,
+      "grad_norm": 0.42025187611579895,
+      "learning_rate": 1.538745393372281e-05,
+      "loss": 0.5471,
+      "step": 370
+    },
+    {
+      "epoch": 0.7483610690872415,
+      "grad_norm": 0.42271459102630615,
+      "learning_rate": 1.5157293134279244e-05,
+      "loss": 0.5239,
+      "step": 371
+    },
+    {
+      "epoch": 0.7503782148260212,
+      "grad_norm": 0.47474098205566406,
+      "learning_rate": 1.492855875400534e-05,
+      "loss": 0.5699,
+      "step": 372
     }
   ],
   "logging_steps": 1,
@@ -1762,7 +2630,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.194181341983539e+17,
+  "total_flos": 6.386815140070687e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null