thomwolf HF staff committed on
Commit
e5bf487
·
1 Parent(s): a23ef52
README.md CHANGED
@@ -6,6 +6,9 @@ colorTo: purple
6
  sdk: static
7
  pinned: false
8
  license: apache-2.0
 
 
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
6
  sdk: static
7
  pinned: false
8
  license: apache-2.0
9
+ header: mini
10
+ app_file: dist/index.html
11
+ thumbnail:
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
dist/bibliography.bib ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ @article{radford2019language,
2
+ title={Language Models are Unsupervised Multitask Learners},
3
+ author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
4
+ year={2019}
5
+ }
dist/distill.bundle.js ADDED
The diff for this file is too large to render. See raw diff
 
dist/distill.bundle.js.map ADDED
The diff for this file is too large to render. See raw diff
 
dist/index.html ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
5
+ <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
6
+ <meta name="viewport" content="width=device-width, initial-scale=1">
7
+ <meta charset="utf-8">
8
+ <base target="_blank">
9
+ <title>FineWeb: decanting the web for the finest text data at scale</title>
10
+ <link rel="stylesheet" href="style.css">
11
+ </head>
12
+
13
+ <body>
14
+ <d-front-matter>
15
+ <script id='distill-front-matter' type="text/json">{
16
+ "title": "Nanotron Gigablogpost",
17
+ "description": "This blog covers everything.",
18
+ "published": "May 28, 2024",
19
+ "affiliation": {"name": "HuggingFace"},
20
+ "authors": [
21
+ {
22
+ "author":"John Doe",
23
+ "authorURL":"https://huggingface.co/"
24
+ }
25
+ ],
26
+ "katex": {
27
+ "delimiters": [
28
+ {"left": "$$", "right": "$$", "display": false}
29
+ ]
30
+ }
31
+ }
32
+ </script>
33
+ </d-front-matter>
34
+ <d-title>
35
+ <h1 class="l-page" style="text-align: center;">Nanotron Gigablogpost</h1>
36
+ <div id="title-plot" class="main-plot-container l-screen">
37
+ <figure>
38
+ <img src="assets/images/banner.png" alt="FineWeb">
39
+ </figure>
40
+ <div id="clusters-plot">
41
+ <img src="assets/images/clusters.png" alt="Clusters">
42
+ </div>
43
+ </div>
44
+ </d-title>
45
+ <d-byline></d-byline>
46
+ <d-article>
47
+ <d-contents>
48
+ </d-contents>
49
+
50
+ <p>The performance of a large language model (LLM) depends heavily on the quality and size of its pretraining framework.</p>
51
+ </d-article>
52
+
53
+ <d-appendix>
54
+ <d-bibliography src="bibliography.bib"></d-bibliography>
55
+ </d-appendix>
56
+
57
+ <script>
58
+ const article = document.querySelector('d-article');
59
+ const toc = document.querySelector('d-contents');
60
+ if (toc) {
61
+ const headings = article.querySelectorAll('h2, h3, h4');
62
+ let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
63
+ let prevLevel = 0;
64
+
65
+ for (const el of headings) {
66
+ // should element be included in TOC?
67
+ const isInTitle = el.parentElement.tagName == 'D-TITLE';
68
+ const isException = el.getAttribute('no-toc');
69
+ if (isInTitle || isException) continue;
70
+ el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
71
+ const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
72
+
73
+ const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
74
+ while (prevLevel < level) {
75
+ ToC += '<ul>'
76
+ prevLevel++;
77
+ }
78
+ while (prevLevel > level) {
79
+ ToC += '</ul>'
80
+ prevLevel--;
81
+ }
82
+ if (level === 0)
83
+ ToC += '<div>' + link + '</div>';
84
+ else
85
+ ToC += '<li>' + link + '</li>';
86
+ }
87
+
88
+ while (prevLevel > 0) {
89
+ ToC += '</ul>'
90
+ prevLevel--;
91
+ }
92
+ ToC += '</nav>';
93
+ toc.innerHTML = ToC;
94
+ toc.setAttribute('prerendered', 'true');
95
+ const toc_links = document.querySelectorAll('d-contents > nav a');
96
+
97
+ window.addEventListener('scroll', (_event) => {
98
+ if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
99
+ // Iterate backwards from the last heading; highlight the first one already scrolled past the 50px offset, then break
100
+ find_active: {
101
+ for (let i = headings.length - 1; i >= 0; i--) {
102
+ if (headings[i].getBoundingClientRect().top - 50 <= 0) {
103
+ if (!toc_links[i].classList.contains("active")) {
104
+ toc_links.forEach((link, _index) => {
105
+ link.classList.remove("active");
106
+ });
107
+ toc_links[i].classList.add('active');
108
+ }
109
+ break find_active;
110
+ }
111
+ }
112
+ toc_links.forEach((link, _index) => {
113
+ link.classList.remove("active");
114
+ });
115
+ }
116
+ }
117
+ });
118
+ }
119
+ </script>
120
+ </body>
121
+ </html>
dist/main.bundle.js ADDED
The diff for this file is too large to render. See raw diff
 
dist/main.bundle.js.LICENSE.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* @license
2
+ Papa Parse
3
+ v5.4.1
4
+ https://github.com/mholt/PapaParse
5
+ License: MIT
6
+ */
7
+
8
+ /*! For license information please see plotly-basic.min.js.LICENSE.txt */
9
+
10
+ /*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
11
+
12
+ /**
13
+ * @license
14
+ * Lodash <https://lodash.com/>
15
+ * Copyright OpenJS Foundation and other contributors <https://openjsf.org/>
16
+ * Released under MIT license <https://lodash.com/license>
17
+ * Based on Underscore.js 1.8.3 <http://underscorejs.org/LICENSE>
18
+ * Copyright Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
19
+ */
dist/main.bundle.js.map ADDED
The diff for this file is too large to render. See raw diff
 
dist/style.css ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* style.css */
2
+ /* Define colors */
3
+ :root {
4
+ --distill-gray: rgb(107, 114, 128);
5
+ --distill-gray-light: rgb(185, 185, 185);
6
+ --distill-gray-lighter: rgb(228, 228, 228);
7
+ --distill-gray-lightest: rgb(245, 245, 245);
8
+ --distill-blue: #007BFF;
9
+ }
10
+
11
+ /* Container for the controls */
12
+ [id^="plot-"] {
13
+ display: flex;
14
+ flex-direction: column;
15
+ align-items: center;
16
+ gap: 15px; /* Adjust the gap between controls as needed */
17
+ }
18
+ [id^="plot-"] figure {
19
+ margin-bottom: 0px;
20
+ margin-top: 0px;
21
+ padding: 0px;
22
+ }
23
+
24
+ .plotly_caption {
25
+ font-style: italic;
26
+ margin-top: 10px;
27
+ }
28
+
29
+ .plotly_controls {
30
+ display: flex;
31
+ flex-wrap: wrap;
32
+ flex-direction: row;
33
+ justify-content: center;
34
+ align-items: flex-start;
35
+ gap: 30px;
36
+ }
37
+
38
+
39
+ .plotly_input_container {
40
+ display: flex;
41
+ align-items: center;
42
+ flex-direction: column;
43
+ gap: 10px;
44
+ }
45
+
46
+ /* Style for the select dropdown */
47
+ .plotly_input_container > select {
48
+ padding: 2px 4px;
49
+ /* border: 1px solid #ccc; */
50
+ line-height: 1.5em;
51
+ text-align: center;
52
+ border-radius: 4px;
53
+ font-size: 12px;
54
+ background-color: var(--distill-gray-lightest);
55
+ outline: none;
56
+ }
57
+
58
+ /* Style for the range input */
59
+
60
+ .plotly_slider {
61
+ display: flex;
62
+ align-items: center;
63
+ gap: 10px;
64
+ }
65
+
66
+ .plotly_slider > input[type="range"] {
67
+ -webkit-appearance: none;
68
+ height: 2px;
69
+ background: var(--distill-gray-light);
70
+ border-radius: 5px;
71
+ outline: none;
72
+ }
73
+
74
+ .plotly_slider > span {
75
+ font-size: 14px;
76
+ line-height: 1.6em;
77
+ min-width: 16px;
78
+ }
79
+
80
+ .plotly_slider > input[type="range"]::-webkit-slider-thumb {
81
+ -webkit-appearance: none;
82
+ appearance: none;
83
+ width: 18px;
84
+ height: 18px;
85
+ border-radius: 50%;
86
+ background: var(--distill-blue);
87
+ cursor: pointer;
88
+ }
89
+
90
+ .plotly_slider > input[type="range"]::-moz-range-thumb {
91
+ width: 18px;
92
+ height: 18px;
93
+ border-radius: 50%;
94
+ background: var(--distill-blue);
95
+ cursor: pointer;
96
+ }
97
+
98
+ /* Style for the labels */
99
+ .plotly_input_container > label {
100
+ font-size: 14px;
101
+ font-weight: bold;
102
+ }
103
+
104
+ .main-plot-container {
105
+ margin-top: 21px;
106
+ margin-bottom: 35px;
107
+ }
108
+
109
+ .main-plot-container > figure {
110
+ display: block !important;
111
+ /* Let this be handled by graph-container */
112
+ margin-bottom: 0px;
113
+ margin-top: 0px;
114
+ }
115
+ .main-plot-container > div {
116
+ display: none !important;
117
+ }
118
+
119
+
120
+ @media (min-width: 768px) {
121
+ .main-plot-container > figure {
122
+ display: none !important;
123
+ }
124
+ .main-plot-container > div {
125
+ display: flex !important;
126
+ }
127
+ }
128
+
129
+ d-byline .byline {
130
+ grid-template-columns: 1fr;
131
+ grid-column: text;
132
+ font-size: 0.9rem;
133
+ line-height: 1.8em;
134
+ }
135
+
136
+ @media (min-width: 768px) {
137
+ d-byline .byline {
138
+ grid-template-columns: 5fr 1fr 1fr;
139
+ }
140
+ }
141
+
142
+ #title-plot {
143
+ margin-top: 0px;
144
+ margin-bottom: 0px;
145
+ }
146
+
147
+ d-contents > nav a.active {
148
+ text-decoration: underline;
149
+ }
150
+
151
+ @media (max-width: 1199px) {
152
+ d-contents {
153
+ display: none;
154
+ justify-self: start;
155
+ align-self: start;
156
+ padding-bottom: 0.5em;
157
+ margin-bottom: 1em;
158
+ padding-left: 0.25em;
159
+ border-bottom: 1px solid rgba(0, 0, 0, 0.1);
160
+ border-bottom-width: 1px;
161
+ border-bottom-style: solid;
162
+ border-bottom-color: rgba(0, 0, 0, 0.1);
163
+ }
164
+ }
165
+
166
+ d-contents a:hover {
167
+ border-bottom: none;
168
+ }
169
+
170
+
171
+ @media (min-width: 1200px) {
172
+ d-article {
173
+ /* Ensure d-article does not prevent sticky positioning */
174
+ overflow: visible;
175
+ }
176
+
177
+ d-contents {
178
+ align-self: start;
179
+ grid-column-start: 1 !important;
180
+ grid-column-end: 4 !important;
181
+ grid-row: auto / span 6;
182
+ justify-self: end;
183
+ margin-top: 0em;
184
+ padding-right: 3em;
185
+ padding-left: 2em;
186
+ border-right: 1px solid rgba(0, 0, 0, 0.1);
187
+ border-right-width: 1px;
188
+ border-right-style: solid;
189
+ border-right-color: rgba(0, 0, 0, 0.1);
190
+ position: -webkit-sticky; /* For Safari */
191
+ position: sticky;
192
+ top: 10px; /* Adjust this value if needed */
193
+ }
194
+ }
195
+
196
+ d-contents nav h3 {
197
+ margin-top: 0;
198
+ margin-bottom: 1em;
199
+ }
200
+
201
+ d-contents nav div {
202
+ color: rgba(0, 0, 0, 0.8);
203
+ font-weight: bold;
204
+ }
205
+
206
+ d-contents nav a {
207
+ color: rgba(0, 0, 0, 0.8);
208
+ border-bottom: none;
209
+ text-decoration: none;
210
+ }
211
+
212
+ d-contents li {
213
+ list-style-type: none;
214
+ }
215
+
216
+ d-contents ul, d-article d-contents ul {
217
+ padding-left: 1em;
218
+ }
219
+
220
+ d-contents nav ul li {
221
+ margin-bottom: .25em;
222
+ }
223
+
224
+ d-contents nav a:hover {
225
+ text-decoration: underline solid rgba(0, 0, 0, 0.6);
226
+ }
227
+
228
+ d-contents nav ul {
229
+ margin-top: 0;
230
+ margin-bottom: 6px;
231
+ }
232
+
233
+
234
+ d-contents nav > div {
235
+ display: block;
236
+ outline: none;
237
+ margin-bottom: 0.5em;
238
+ }
239
+
240
+ d-contents nav > div > a {
241
+ font-size: 13px;
242
+ font-weight: 600;
243
+ }
244
+
245
+ d-article aside {
246
+ margin-bottom: 1em;
247
+ }
248
+
249
+ @media (min-width: 768px) {
250
+ d-article aside {
251
+ margin-bottom: 0;
252
+ }
253
+ }
254
+
255
+ d-contents nav > div > a:hover,
256
+ d-contents nav > ul > li > a:hover {
257
+ text-decoration: none;
258
+ }
259
+
index.html DELETED
@@ -1,19 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dependencies": {
3
+ "lodash": "^4.17.21",
4
+ "papaparse": "^5.4.1",
5
+ "plotly.js-basic-dist-min": "^2.33.0"
6
+ },
7
+ "name": "blogpost",
8
+ "version": "1.0.0",
9
+ "description": "--- title: \"The Nanotron Gigablogpost\" emoji: 🍷 colorFrom: pink colorTo: red sdk: static pinned: false header: mini ---",
10
+ "main": "index.js",
11
+ "scripts": {
12
+ "dev": "webpack serve --open",
13
+ "build": "NODE_ENV=production webpack"
14
+ },
15
+ "author": "",
16
+ "license": "ISC",
17
+ "devDependencies": {
18
+ "@babel/preset-env": "^7.24.6",
19
+ "babel-loader": "^9.1.3",
20
+ "clean-webpack-plugin": "^4.0.0",
21
+ "compression-webpack-plugin": "^11.1.0",
22
+ "copy-webpack-plugin": "^12.0.2",
23
+ "css-loader": "^7.1.2",
24
+ "html-webpack-change-assets-extension-plugin": "^1.3.1",
25
+ "html-webpack-plugin": "^5.6.0",
26
+ "style-loader": "^4.0.0",
27
+ "webpack": "^5.91.0",
28
+ "webpack-bundle-analyzer": "^4.10.2",
29
+ "webpack-cli": "^5.1.4",
30
+ "webpack-dev-server": "^5.0.4"
31
+ }
32
+ }
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }